org.apache.accumulo.master.Master.java Source code

Introduction

Here is the source code for org.apache.accumulo.master.Master.java
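
The Master is the Accumulo process that assigns and balances tablets across tablet servers, coordinates write-ahead log recovery, and reports general cluster status. Internally it is driven by a small state machine (INITIAL, HAVE_LOCK, SAFE_MODE, NORMAL, UNLOAD_METADATA_TABLETS, UNLOAD_ROOT_TABLET, STOP) whose legal transitions are encoded in the transitionOK table that appears later in the source. The standalone sketch below (not part of Master.java; the class and method names are illustrative) shows the kind of table lookup setMasterState performs before it changes state:

// Standalone sketch, not part of Master.java: demonstrates the transitionOK lookup
// that Master.setMasterState uses to validate a requested state change.
public class MasterStateTransitionSketch {
    // Illustrative copy of the master states, in the same order as the real enum.
    enum State { INITIAL, HAVE_LOCK, SAFE_MODE, NORMAL, UNLOAD_METADATA_TABLETS, UNLOAD_ROOT_TABLET, STOP }

    static final boolean X = true, O = false;
    // Same table as in the source below: transitionOK[current][next].
    static final boolean[][] transitionOK = {
            /* INITIAL */                 { X, X, O, O, O, O, X },
            /* HAVE_LOCK */               { O, X, X, X, O, O, X },
            /* SAFE_MODE */               { O, O, X, X, X, O, X },
            /* NORMAL */                  { O, O, X, X, X, O, X },
            /* UNLOAD_METADATA_TABLETS */ { O, O, X, X, X, X, X },
            /* UNLOAD_ROOT_TABLET */      { O, O, O, X, X, X, X },
            /* STOP */                    { O, O, O, O, O, X, X } };

    static boolean canTransition(State from, State to) {
        return transitionOK[from.ordinal()][to.ordinal()];
    }

    public static void main(String[] args) {
        System.out.println(canTransition(State.HAVE_LOCK, State.NORMAL)); // true
        System.out.println(canTransition(State.INITIAL, State.NORMAL));   // false
    }
}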

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.master;

import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.impl.Namespaces;
import org.apache.accumulo.core.client.impl.Tables;
import org.apache.accumulo.core.client.impl.ThriftTransportPool;
import org.apache.accumulo.core.client.impl.thrift.TableOperation;
import org.apache.accumulo.core.client.impl.thrift.TableOperationExceptionType;
import org.apache.accumulo.core.client.impl.thrift.ThriftTableOperationException;
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.conf.SiteConfiguration;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.data.impl.KeyExtent;
import org.apache.accumulo.core.master.state.tables.TableState;
import org.apache.accumulo.core.master.thrift.BulkImportState;
import org.apache.accumulo.core.master.thrift.MasterClientService.Iface;
import org.apache.accumulo.core.master.thrift.MasterClientService.Processor;
import org.apache.accumulo.core.master.thrift.MasterGoalState;
import org.apache.accumulo.core.master.thrift.MasterMonitorInfo;
import org.apache.accumulo.core.master.thrift.MasterState;
import org.apache.accumulo.core.master.thrift.TableInfo;
import org.apache.accumulo.core.master.thrift.TabletServerStatus;
import org.apache.accumulo.core.metadata.MetadataTable;
import org.apache.accumulo.core.metadata.RootTable;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection;
import org.apache.accumulo.core.replication.ReplicationTable;
import org.apache.accumulo.core.replication.thrift.ReplicationCoordinator;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.security.NamespacePermission;
import org.apache.accumulo.core.security.TablePermission;
import org.apache.accumulo.core.tabletserver.thrift.TUnloadTabletGoal;
import org.apache.accumulo.core.trace.DistributedTrace;
import org.apache.accumulo.core.trace.thrift.TInfo;
import org.apache.accumulo.core.util.Daemon;
import org.apache.accumulo.core.util.Pair;
import org.apache.accumulo.core.zookeeper.ZooUtil;
import org.apache.accumulo.fate.AgeOffStore;
import org.apache.accumulo.fate.Fate;
import org.apache.accumulo.fate.zookeeper.IZooReaderWriter;
import org.apache.accumulo.fate.zookeeper.ZooLock.LockLossReason;
import org.apache.accumulo.fate.zookeeper.ZooUtil.NodeExistsPolicy;
import org.apache.accumulo.fate.zookeeper.ZooUtil.NodeMissingPolicy;
import org.apache.accumulo.master.metrics.MasterMetricsFactory;
import org.apache.accumulo.master.recovery.RecoveryManager;
import org.apache.accumulo.master.replication.MasterReplicationCoordinator;
import org.apache.accumulo.master.replication.ReplicationDriver;
import org.apache.accumulo.master.replication.WorkDriver;
import org.apache.accumulo.master.state.TableCounts;
import org.apache.accumulo.server.Accumulo;
import org.apache.accumulo.server.AccumuloServerContext;
import org.apache.accumulo.server.HighlyAvailableService;
import org.apache.accumulo.server.ServerConstants;
import org.apache.accumulo.server.ServerOpts;
import org.apache.accumulo.server.client.HdfsZooInstance;
import org.apache.accumulo.server.conf.ServerConfigurationFactory;
import org.apache.accumulo.server.fs.VolumeManager;
import org.apache.accumulo.server.fs.VolumeManager.FileType;
import org.apache.accumulo.server.fs.VolumeManagerImpl;
import org.apache.accumulo.server.init.Initialize;
import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.log.WalStateManager.WalMarkerException;
import org.apache.accumulo.server.master.LiveTServerSet;
import org.apache.accumulo.server.master.LiveTServerSet.TServerConnection;
import org.apache.accumulo.server.master.balancer.DefaultLoadBalancer;
import org.apache.accumulo.server.master.balancer.TabletBalancer;
import org.apache.accumulo.server.master.state.CurrentState;
import org.apache.accumulo.server.master.state.DeadServerList;
import org.apache.accumulo.server.master.state.MergeInfo;
import org.apache.accumulo.server.master.state.MergeState;
import org.apache.accumulo.server.master.state.MetaDataStateStore;
import org.apache.accumulo.server.master.state.RootTabletStateStore;
import org.apache.accumulo.server.master.state.TServerInstance;
import org.apache.accumulo.server.master.state.TabletLocationState;
import org.apache.accumulo.server.master.state.TabletMigration;
import org.apache.accumulo.server.master.state.TabletServerState;
import org.apache.accumulo.server.master.state.TabletState;
import org.apache.accumulo.server.master.state.ZooStore;
import org.apache.accumulo.server.master.state.ZooTabletStateStore;
import org.apache.accumulo.server.metrics.Metrics;
import org.apache.accumulo.server.replication.ZooKeeperInitialization;
import org.apache.accumulo.server.rpc.HighlyAvailableServiceWrapper;
import org.apache.accumulo.server.rpc.RpcWrapper;
import org.apache.accumulo.server.rpc.ServerAddress;
import org.apache.accumulo.server.rpc.TCredentialsUpdatingWrapper;
import org.apache.accumulo.server.rpc.TServerUtils;
import org.apache.accumulo.server.rpc.ThriftServerType;
import org.apache.accumulo.server.security.AuditedSecurityOperation;
import org.apache.accumulo.server.security.SecurityOperation;
import org.apache.accumulo.server.security.SecurityUtil;
import org.apache.accumulo.server.security.delegation.AuthenticationTokenKeyManager;
import org.apache.accumulo.server.security.delegation.AuthenticationTokenSecretManager;
import org.apache.accumulo.server.security.delegation.ZooAuthenticationKeyDistributor;
import org.apache.accumulo.server.security.handler.ZKPermHandler;
import org.apache.accumulo.server.tables.TableManager;
import org.apache.accumulo.server.tables.TableObserver;
import org.apache.accumulo.server.util.DefaultMap;
import org.apache.accumulo.server.util.Halt;
import org.apache.accumulo.server.util.MetadataTableUtil;
import org.apache.accumulo.server.util.ServerBulkImportStatus;
import org.apache.accumulo.server.util.TableInfoUtil;
import org.apache.accumulo.server.util.time.SimpleTimer;
import org.apache.accumulo.server.zookeeper.ZooLock;
import org.apache.accumulo.server.zookeeper.ZooReaderWriter;
import org.apache.accumulo.start.classloader.vfs.AccumuloVFSClassLoader;
import org.apache.accumulo.start.classloader.vfs.ContextManager;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.thrift.TException;
import org.apache.thrift.server.TServer;
import org.apache.thrift.transport.TTransportException;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoAuthException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * The Master is responsible for assigning and balancing tablets to tablet servers.
 *
 * The master will also coordinate log recoveries and report general status.
 */
public class Master extends AccumuloServerContext
        implements LiveTServerSet.Listener, TableObserver, CurrentState, HighlyAvailableService {

    final static Logger log = LoggerFactory.getLogger(Master.class);

    final static int ONE_SECOND = 1000;
    final static long TIME_TO_WAIT_BETWEEN_SCANS = 60 * ONE_SECOND;
    final private static long TIME_BETWEEN_MIGRATION_CLEANUPS = 5 * 60 * ONE_SECOND;
    final static long WAIT_BETWEEN_ERRORS = ONE_SECOND;
    final private static long DEFAULT_WAIT_FOR_WATCHER = 10 * ONE_SECOND;
    final private static int MAX_CLEANUP_WAIT_TIME = ONE_SECOND;
    final private static int TIME_TO_WAIT_BETWEEN_LOCK_CHECKS = ONE_SECOND;
    final static int MAX_TSERVER_WORK_CHUNK = 5000;
    final private static int MAX_BAD_STATUS_COUNT = 3;

    final VolumeManager fs;
    final private String hostname;
    final private Object balancedNotifier = new Object();
    final LiveTServerSet tserverSet;
    final private List<TabletGroupWatcher> watchers = new ArrayList<>();
    final SecurityOperation security;
    final Map<TServerInstance, AtomicInteger> badServers = Collections
            .synchronizedMap(new DefaultMap<TServerInstance, AtomicInteger>(new AtomicInteger()));
    final Set<TServerInstance> serversToShutdown = Collections.synchronizedSet(new HashSet<TServerInstance>());
    final SortedMap<KeyExtent, TServerInstance> migrations = Collections
            .synchronizedSortedMap(new TreeMap<KeyExtent, TServerInstance>());
    final EventCoordinator nextEvent = new EventCoordinator();
    final private Object mergeLock = new Object();
    private ReplicationDriver replicationWorkDriver;
    private WorkDriver replicationWorkAssigner;
    RecoveryManager recoveryManager = null;
    private final MasterTime timeKeeper;

    // Delegation Token classes
    private final boolean delegationTokensAvailable;
    private ZooAuthenticationKeyDistributor keyDistributor;
    private AuthenticationTokenKeyManager authenticationTokenKeyManager;

    ZooLock masterLock = null;
    private TServer clientService = null;
    TabletBalancer tabletBalancer;

    private MasterState state = MasterState.INITIAL;

    Fate<Master> fate;

    volatile SortedMap<TServerInstance, TabletServerStatus> tserverStatus = Collections
            .unmodifiableSortedMap(new TreeMap<TServerInstance, TabletServerStatus>());
    final ServerBulkImportStatus bulkImportStatus = new ServerBulkImportStatus();

    @Override
    public synchronized MasterState getMasterState() {
        return state;
    }

    public boolean stillMaster() {
        return getMasterState() != MasterState.STOP;
    }

    static final boolean X = true;
    static final boolean O = false;
    // @formatter:off
    static final boolean transitionOK[][] = {
            //                            INITIAL HAVE_LOCK SAFE_MODE NORMAL UNLOAD_META UNLOAD_ROOT STOP
            /* INITIAL */                 { X, X, O, O, O, O, X },
            /* HAVE_LOCK */               { O, X, X, X, O, O, X },
            /* SAFE_MODE */               { O, O, X, X, X, O, X },
            /* NORMAL */                  { O, O, X, X, X, O, X },
            /* UNLOAD_METADATA_TABLETS */ { O, O, X, X, X, X, X },
            /* UNLOAD_ROOT_TABLET */      { O, O, O, X, X, X, X },
            /* STOP */                    { O, O, O, O, O, X, X } };

    //@formatter:on
    synchronized void setMasterState(MasterState newState) {
        if (state.equals(newState))
            return;
        if (!transitionOK[state.ordinal()][newState.ordinal()]) {
            log.error("Programmer error: master should not transition from " + state + " to " + newState);
        }
        MasterState oldState = state;
        state = newState;
        nextEvent.event("State changed from %s to %s", oldState, newState);
        if (newState == MasterState.STOP) {
            // Give the server a little time before shutdown so the client
            // thread requesting the stop can return
            SimpleTimer.getInstance(getConfiguration()).schedule(new Runnable() {
                @Override
                public void run() {
                    // This frees the main thread and will cause the master to exit
                    clientService.stop();
                    Master.this.nextEvent.event("stopped event loop");
                }

            }, 100L, 1000L);
        }

        if (oldState != newState && (newState == MasterState.HAVE_LOCK)) {
            upgradeZookeeper();
        }

        if (oldState != newState && (newState == MasterState.NORMAL)) {
            upgradeMetadata();
        }
    }

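    // One-time upgrade step: if ZooKeeper has no root-tablet directory entry yet, move the root
    // tablet's directory out of the metadata table's space into the new root table directory and
    // record its location under ZROOT_TABLET_PATH.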
    private void moveRootTabletToRootTable(IZooReaderWriter zoo) throws Exception {
        String dirZPath = ZooUtil.getRoot(getInstance()) + RootTable.ZROOT_TABLET_PATH;

        if (!zoo.exists(dirZPath)) {
            Path oldPath = fs.getFullPath(FileType.TABLE, "/" + MetadataTable.ID + "/root_tablet");
            if (fs.exists(oldPath)) {
                String newPath = fs.choose(Optional.of(RootTable.ID), ServerConstants.getBaseUris())
                        + Constants.HDFS_TABLES_DIR + Path.SEPARATOR + RootTable.ID;
                fs.mkdirs(new Path(newPath));
                if (!fs.rename(oldPath, new Path(newPath))) {
                    throw new IOException("Failed to move root tablet from " + oldPath + " to " + newPath);
                }

                log.info("Upgrade renamed " + oldPath + " to " + newPath);
            }

            Path location = null;

            for (String basePath : ServerConstants.getTablesDirs()) {
                Path path = new Path(basePath + "/" + RootTable.ID + RootTable.ROOT_TABLET_LOCATION);
                if (fs.exists(path)) {
                    if (location != null) {
                        throw new IllegalStateException(
                                "Root table at multiple locations " + location + " " + path);
                    }

                    location = path;
                }
            }

            if (location == null)
                throw new IllegalStateException("Failed to find root tablet");

            log.info("Upgrade setting root table location in zookeeper " + location);
            zoo.putPersistentData(dirZPath, location.toString().getBytes(), NodeExistsPolicy.FAIL);
        }
    }

    private boolean haveUpgradedZooKeeper = false;

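    // Runs once this Master holds the lock: if the persistent data version is from an older
    // release, bring the ZooKeeper layout up to date (recovery node, namespaces, replication and
    // root table entries, durability settings, permissions, WAL nodes) before Fate is started.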
    private void upgradeZookeeper() {
        // 1.5.1 and 1.6.0 both do some state checking after obtaining the zoolock for the
        // monitor and before starting up. It's not tied to the data version at all (and would
        // introduce unnecessary complexity to try to make the master do it), but be aware
        // that the master is not the only thing that may alter zookeeper before starting.

        final int accumuloPersistentVersion = Accumulo.getAccumuloPersistentVersion(fs);
        if (Accumulo.persistentVersionNeedsUpgrade(accumuloPersistentVersion)) {
            // This Master hasn't started Fate yet, so any outstanding transactions must be from before the upgrade.
            // Change to Guava's Verify once we use Guava 17.
            if (null != fate) {
                throw new IllegalStateException(
                        "Access to Fate should not have been initialized prior to the Master transitioning to active. Please save all logs and file a bug.");
            }
            Accumulo.abortIfFateTransactions();
            try {
                log.info("Upgrading zookeeper");

                IZooReaderWriter zoo = ZooReaderWriter.getInstance();
                final String zooRoot = ZooUtil.getRoot(getInstance());

                log.debug("Handling updates for version " + accumuloPersistentVersion);

                log.debug("Cleaning out remnants of logger role.");
                zoo.recursiveDelete(zooRoot + "/loggers", NodeMissingPolicy.SKIP);
                zoo.recursiveDelete(zooRoot + "/dead/loggers", NodeMissingPolicy.SKIP);

                final byte[] zero = new byte[] { '0' };
                log.debug("Initializing recovery area.");
                zoo.putPersistentData(zooRoot + Constants.ZRECOVERY, zero, NodeExistsPolicy.SKIP);

                for (String id : zoo.getChildren(zooRoot + Constants.ZTABLES)) {
                    log.debug("Prepping table " + id + " for compaction cancellations.");
                    zoo.putPersistentData(
                            zooRoot + Constants.ZTABLES + "/" + id + Constants.ZTABLE_COMPACT_CANCEL_ID, zero,
                            NodeExistsPolicy.SKIP);
                }

                @SuppressWarnings("deprecation")
                String zpath = zooRoot + Constants.ZCONFIG + "/" + Property.TSERV_WAL_SYNC_METHOD.getKey();
                // is the entire instance set to use flushing vs sync?
                boolean flushDefault = false;
                try {
                    byte data[] = zoo.getData(zpath, null);
                    if (new String(data, UTF_8).endsWith("flush")) {
                        flushDefault = true;
                    }
                } catch (KeeperException.NoNodeException ex) {
                    // skip
                }
                for (String id : zoo.getChildren(zooRoot + Constants.ZTABLES)) {
                    log.debug("Converting table " + id + " WALog setting to Durability");
                    try {
                        @SuppressWarnings("deprecation")
                        String path = zooRoot + Constants.ZTABLES + "/" + id + Constants.ZTABLE_CONF + "/"
                                + Property.TABLE_WALOG_ENABLED.getKey();
                        byte[] data = zoo.getData(path, null);
                        boolean useWAL = Boolean.parseBoolean(new String(data, UTF_8));
                        zoo.recursiveDelete(path, NodeMissingPolicy.FAIL);
                        path = zooRoot + Constants.ZTABLES + "/" + id + Constants.ZTABLE_CONF + "/"
                                + Property.TABLE_DURABILITY.getKey();
                        if (useWAL) {
                            if (flushDefault) {
                                zoo.putPersistentData(path, "flush".getBytes(), NodeExistsPolicy.SKIP);
                            } else {
                                zoo.putPersistentData(path, "sync".getBytes(), NodeExistsPolicy.SKIP);
                            }
                        } else {
                            zoo.putPersistentData(path, "none".getBytes(), NodeExistsPolicy.SKIP);
                        }
                    } catch (KeeperException.NoNodeException ex) {
                        // skip it
                    }
                }

                // create initial namespaces
                String namespaces = ZooUtil.getRoot(getInstance()) + Constants.ZNAMESPACES;
                zoo.putPersistentData(namespaces, new byte[0], NodeExistsPolicy.SKIP);
                for (Pair<String, String> namespace : Iterables.concat(
                        Collections.singleton(
                                new Pair<>(Namespaces.ACCUMULO_NAMESPACE, Namespaces.ACCUMULO_NAMESPACE_ID)),
                        Collections.singleton(
                                new Pair<>(Namespaces.DEFAULT_NAMESPACE, Namespaces.DEFAULT_NAMESPACE_ID)))) {
                    String ns = namespace.getFirst();
                    String id = namespace.getSecond();
                    log.debug("Upgrade creating namespace \"" + ns + "\" (ID: " + id + ")");
                    if (!Namespaces.exists(getInstance(), id))
                        TableManager.prepareNewNamespaceState(getInstance().getInstanceID(), id, ns,
                                NodeExistsPolicy.SKIP);
                }

                // create replication table in zk
                log.debug("Upgrade creating table " + ReplicationTable.NAME + " (ID: " + ReplicationTable.ID + ")");
                TableManager.prepareNewTableState(getInstance().getInstanceID(), ReplicationTable.ID,
                        Namespaces.ACCUMULO_NAMESPACE_ID, ReplicationTable.NAME, TableState.OFFLINE,
                        NodeExistsPolicy.SKIP);

                // create root table
                log.debug("Upgrade creating table " + RootTable.NAME + " (ID: " + RootTable.ID + ")");
                TableManager.prepareNewTableState(getInstance().getInstanceID(), RootTable.ID,
                        Namespaces.ACCUMULO_NAMESPACE_ID, RootTable.NAME, TableState.ONLINE, NodeExistsPolicy.SKIP);
                Initialize.initSystemTablesConfig();
                // ensure root user can flush root table
                security.grantTablePermission(rpcCreds(), security.getRootUsername(), RootTable.ID,
                        TablePermission.ALTER_TABLE, Namespaces.ACCUMULO_NAMESPACE_ID);

                // put existing tables in the correct namespaces
                String tables = ZooUtil.getRoot(getInstance()) + Constants.ZTABLES;
                for (String tableId : zoo.getChildren(tables)) {
                    String targetNamespace = (MetadataTable.ID.equals(tableId) || RootTable.ID.equals(tableId))
                            ? Namespaces.ACCUMULO_NAMESPACE_ID
                            : Namespaces.DEFAULT_NAMESPACE_ID;
                    log.debug("Upgrade moving table "
                            + new String(zoo.getData(tables + "/" + tableId + Constants.ZTABLE_NAME, null), UTF_8)
                            + " (ID: " + tableId + ") into namespace with ID " + targetNamespace);
                    zoo.putPersistentData(tables + "/" + tableId + Constants.ZTABLE_NAMESPACE,
                            targetNamespace.getBytes(UTF_8), NodeExistsPolicy.SKIP);
                }

                // rename metadata table
                log.debug("Upgrade renaming table " + MetadataTable.OLD_NAME + " (ID: " + MetadataTable.ID + ") to "
                        + MetadataTable.NAME);
                zoo.putPersistentData(tables + "/" + MetadataTable.ID + Constants.ZTABLE_NAME,
                        Tables.qualify(MetadataTable.NAME).getSecond().getBytes(UTF_8), NodeExistsPolicy.OVERWRITE);

                moveRootTabletToRootTable(zoo);

                // add system namespace permissions to existing users
                ZKPermHandler perm = new ZKPermHandler();
                perm.initialize(getInstance().getInstanceID(), true);
                String users = ZooUtil.getRoot(getInstance()) + "/users";
                for (String user : zoo.getChildren(users)) {
                    zoo.putPersistentData(users + "/" + user + "/Namespaces", new byte[0], NodeExistsPolicy.SKIP);
                    perm.grantNamespacePermission(user, Namespaces.ACCUMULO_NAMESPACE_ID, NamespacePermission.READ);
                }
                perm.grantNamespacePermission("root", Namespaces.ACCUMULO_NAMESPACE_ID,
                        NamespacePermission.ALTER_TABLE);

                // add the currlog location for root tablet current logs
                zoo.putPersistentData(ZooUtil.getRoot(getInstance()) + RootTable.ZROOT_TABLET_CURRENT_LOGS,
                        new byte[0], NodeExistsPolicy.SKIP);

                // create tablet server wal logs node in ZK
                zoo.putPersistentData(ZooUtil.getRoot(getInstance()) + WalStateManager.ZWALS, new byte[0],
                        NodeExistsPolicy.SKIP);

                haveUpgradedZooKeeper = true;
            } catch (Exception ex) {
                // ACCUMULO-3651 Changed level to error and added FATAL to message for slf4j compatibility
                log.error("FATAL: Error performing upgrade", ex);
                System.exit(1);
            }
        }
    }

    private final AtomicBoolean upgradeMetadataRunning = new AtomicBoolean(false);
    private final CountDownLatch waitForMetadataUpgrade = new CountDownLatch(1);

    private final ServerConfigurationFactory serverConfig;

    private MasterClientServiceHandler clientHandler;

    private void upgradeMetadata() {
        // we make sure we're only doing the rest of this method once so that we can signal to other threads that an upgrade wasn't needed.
        if (upgradeMetadataRunning.compareAndSet(false, true)) {
            final int accumuloPersistentVersion = Accumulo.getAccumuloPersistentVersion(fs);
            if (Accumulo.persistentVersionNeedsUpgrade(accumuloPersistentVersion)) {
                // sanity check that we passed the Fate verification prior to ZooKeeper upgrade, and that Fate still hasn't been started.
                // Change both to use Guava's Verify once we use Guava 17.
                if (!haveUpgradedZooKeeper) {
                    throw new IllegalStateException(
                            "We should only attempt to upgrade Accumulo's metadata table if we've already upgraded ZooKeeper. Please save all logs and file a bug.");
                }
                if (null != fate) {
                    throw new IllegalStateException(
                            "Access to Fate should not have been initialized prior to the Master finishing upgrades. Please save all logs and file a bug.");
                }
                Runnable upgradeTask = new Runnable() {
                    int version = accumuloPersistentVersion;

                    @Override
                    public void run() {
                        try {
                            log.info("Starting to upgrade metadata table.");
                            if (version == ServerConstants.MOVE_DELETE_MARKERS - 1) {
                                log.info("Updating Delete Markers in metadata table for version 1.4");
                                MetadataTableUtil.moveMetaDeleteMarkersFrom14(Master.this);
                                version++;
                            }
                            if (version == ServerConstants.MOVE_TO_ROOT_TABLE - 1) {
                                log.info("Updating Delete Markers in metadata table.");
                                MetadataTableUtil.moveMetaDeleteMarkers(Master.this);
                                version++;
                            }
                            if (version == ServerConstants.MOVE_TO_REPLICATION_TABLE - 1) {
                                log.info("Updating metadata table with entries for the replication table");
                                MetadataTableUtil.createReplicationTable(Master.this);
                                version++;
                            }
                            log.info("Updating persistent data version.");
                            Accumulo.updateAccumuloVersion(fs, accumuloPersistentVersion);
                            log.info("Upgrade complete");
                            waitForMetadataUpgrade.countDown();
                        } catch (Exception ex) {
                            // ACCUMULO-3651 Changed level to error and added FATAL to message for slf4j compatibility
                            log.error("FATAL: Error performing upgrade", ex);
                            System.exit(1);
                        }

                    }
                };

                // need to run this in a separate thread because a lock is held that prevents metadata tablets from being assigned and this task writes to the
                // metadata table
                new Thread(upgradeTask).start();
            } else {
                waitForMetadataUpgrade.countDown();
            }
        }
    }

    private int assignedOrHosted(String tableId) {
        int result = 0;
        for (TabletGroupWatcher watcher : watchers) {
            TableCounts count = watcher.getStats(tableId);
            result += count.hosted() + count.assigned();
        }
        return result;
    }

    private int totalAssignedOrHosted() {
        int result = 0;
        for (TabletGroupWatcher watcher : watchers) {
            for (TableCounts counts : watcher.getStats().values()) {
                result += counts.assigned() + counts.hosted();
            }
        }
        return result;
    }

    private int nonMetaDataTabletsAssignedOrHosted() {
        return totalAssignedOrHosted() - assignedOrHosted(MetadataTable.ID) - assignedOrHosted(RootTable.ID);
    }

    private int notHosted() {
        int result = 0;
        for (TabletGroupWatcher watcher : watchers) {
            for (TableCounts counts : watcher.getStats().values()) {
                result += counts.assigned() + counts.assignedToDeadServers() + counts.suspended();
            }
        }
        return result;
    }

    // The number of unassigned tablets that should be assigned: displayed on the monitor page
    int displayUnassigned() {
        int result = 0;
        switch (getMasterState()) {
        case NORMAL:
            // Count offline tablets for online tables
            for (TabletGroupWatcher watcher : watchers) {
                TableManager manager = TableManager.getInstance();
                for (Entry<String, TableCounts> entry : watcher.getStats().entrySet()) {
                    String tableId = entry.getKey();
                    TableCounts counts = entry.getValue();
                    TableState tableState = manager.getTableState(tableId);
                    if (tableState != null && tableState.equals(TableState.ONLINE)) {
                        result += counts.unassigned() + counts.assignedToDeadServers() + counts.assigned()
                                + counts.suspended();
                    }
                }
            }
            break;
        case SAFE_MODE:
            // Count offline tablets for the metadata table
            for (TabletGroupWatcher watcher : watchers) {
                TableCounts counts = watcher.getStats(MetadataTable.ID);
                result += counts.unassigned() + counts.suspended();
            }
            break;
        case UNLOAD_METADATA_TABLETS:
        case UNLOAD_ROOT_TABLET:
            for (TabletGroupWatcher watcher : watchers) {
                TableCounts counts = watcher.getStats(MetadataTable.ID);
                result += counts.unassigned() + counts.suspended();
            }
            break;
        default:
            break;
        }
        return result;
    }

    public void mustBeOnline(final String tableId) throws ThriftTableOperationException {
        Tables.clearCache(getInstance());
        if (!Tables.getTableState(getInstance(), tableId).equals(TableState.ONLINE))
            throw new ThriftTableOperationException(tableId, null, TableOperation.MERGE,
                    TableOperationExceptionType.OFFLINE, "table is not online");
    }

    public Master(ServerConfigurationFactory config, VolumeManager fs, String hostname) throws IOException {
        super(config);
        this.serverConfig = config;
        this.fs = fs;
        this.hostname = hostname;

        AccumuloConfiguration aconf = serverConfig.getConfiguration();

        log.info("Version " + Constants.VERSION);
        log.info("Instance " + getInstance().getInstanceID());
        timeKeeper = new MasterTime(this);

        ThriftTransportPool.getInstance().setIdleTime(aconf.getTimeInMillis(Property.GENERAL_RPC_TIMEOUT));
        tserverSet = new LiveTServerSet(this, this);
        this.tabletBalancer = aconf.instantiateClassProperty(Property.MASTER_TABLET_BALANCER, TabletBalancer.class,
                new DefaultLoadBalancer());
        this.tabletBalancer.init(serverConfig);

        try {
            AccumuloVFSClassLoader.getContextManager().setContextConfig(
                    new ContextManager.DefaultContextsConfig(new Iterable<Entry<String, String>>() {
                        @Override
                        public Iterator<Entry<String, String>> iterator() {
                            return getConfiguration().iterator();
                        }
                    }));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        this.security = AuditedSecurityOperation.getInstance(this);

        // Create the secret manager (can generate and verify delegation tokens)
        final long tokenLifetime = aconf.getTimeInMillis(Property.GENERAL_DELEGATION_TOKEN_LIFETIME);
        setSecretManager(new AuthenticationTokenSecretManager(getInstance(), tokenLifetime));

        authenticationTokenKeyManager = null;
        keyDistributor = null;
        if (getConfiguration().getBoolean(Property.INSTANCE_RPC_SASL_ENABLED)) {
            // SASL is enabled, create the key distributor (ZooKeeper) and manager (generates/rolls secret keys)
            log.info("SASL is enabled, creating delegation token key manager and distributor");
            final long tokenUpdateInterval = aconf
                    .getTimeInMillis(Property.GENERAL_DELEGATION_TOKEN_UPDATE_INTERVAL);
            keyDistributor = new ZooAuthenticationKeyDistributor(ZooReaderWriter.getInstance(),
                    ZooUtil.getRoot(getInstance()) + Constants.ZDELEGATION_TOKEN_KEYS);
            authenticationTokenKeyManager = new AuthenticationTokenKeyManager(getSecretManager(), keyDistributor,
                    tokenUpdateInterval, tokenLifetime);
            delegationTokensAvailable = true;
        } else {
            log.info("SASL is not enabled, delegation tokens will not be available");
            delegationTokensAvailable = false;
        }

    }

    public TServerConnection getConnection(TServerInstance server) {
        return tserverSet.getConnection(server);
    }

    public MergeInfo getMergeInfo(String tableId) {
        synchronized (mergeLock) {
            try {
                String path = ZooUtil.getRoot(getInstance().getInstanceID()) + Constants.ZTABLES + "/" + tableId
                        + "/merge";
                if (!ZooReaderWriter.getInstance().exists(path))
                    return new MergeInfo();
                byte[] data = ZooReaderWriter.getInstance().getData(path, new Stat());
                DataInputBuffer in = new DataInputBuffer();
                in.reset(data, data.length);
                MergeInfo info = new MergeInfo();
                info.readFields(in);
                return info;
            } catch (KeeperException.NoNodeException ex) {
                log.info("Error reading merge state, it probably just finished");
                return new MergeInfo();
            } catch (Exception ex) {
                log.warn("Unexpected error reading merge state", ex);
                return new MergeInfo();
            }
        }
    }

    public void setMergeState(MergeInfo info, MergeState state)
            throws IOException, KeeperException, InterruptedException {
        synchronized (mergeLock) {
            String path = ZooUtil.getRoot(getInstance().getInstanceID()) + Constants.ZTABLES + "/"
                    + info.getExtent().getTableId() + "/merge";
            info.setState(state);
            if (state.equals(MergeState.NONE)) {
                ZooReaderWriter.getInstance().recursiveDelete(path, NodeMissingPolicy.SKIP);
            } else {
                DataOutputBuffer out = new DataOutputBuffer();
                try {
                    info.write(out);
                } catch (IOException ex) {
                    throw new RuntimeException("Unlikely", ex);
                }
                ZooReaderWriter.getInstance().putPersistentData(path, out.getData(),
                        state.equals(MergeState.STARTED) ? ZooUtil.NodeExistsPolicy.FAIL
                                : ZooUtil.NodeExistsPolicy.OVERWRITE);
            }
            mergeLock.notifyAll();
        }
        nextEvent.event("Merge state of %s set to %s", info.getExtent(), state);
    }

    public void clearMergeState(String tableId) throws IOException, KeeperException, InterruptedException {
        synchronized (mergeLock) {
            String path = ZooUtil.getRoot(getInstance().getInstanceID()) + Constants.ZTABLES + "/" + tableId
                    + "/merge";
            ZooReaderWriter.getInstance().recursiveDelete(path, NodeMissingPolicy.SKIP);
            mergeLock.notifyAll();
        }
        nextEvent.event("Merge state of %s cleared", tableId);
    }

    void setMasterGoalState(MasterGoalState state) {
        try {
            ZooReaderWriter.getInstance().putPersistentData(
                    ZooUtil.getRoot(getInstance()) + Constants.ZMASTER_GOAL_STATE, state.name().getBytes(),
                    NodeExistsPolicy.OVERWRITE);
        } catch (Exception ex) {
            log.error("Unable to set master goal state in zookeeper");
        }
    }

    MasterGoalState getMasterGoalState() {
        while (true)
            try {
                byte[] data = ZooReaderWriter.getInstance()
                        .getData(ZooUtil.getRoot(getInstance()) + Constants.ZMASTER_GOAL_STATE, null);
                return MasterGoalState.valueOf(new String(data));
            } catch (Exception e) {
                log.error("Problem getting real goal state from zookeeper: " + e);
                sleepUninterruptibly(1, TimeUnit.SECONDS);
            }
    }

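    // True only if every TabletGroupWatcher has completed a full scan since the given timestamp.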
    public boolean hasCycled(long time) {
        for (TabletGroupWatcher watcher : watchers) {
            if (watcher.stats.lastScanFinished() < time)
                return false;
        }

        return true;
    }

    public void clearMigrations(String tableId) {
        synchronized (migrations) {
            Iterator<KeyExtent> iterator = migrations.keySet().iterator();
            while (iterator.hasNext()) {
                KeyExtent extent = iterator.next();
                if (extent.getTableId().equals(tableId)) {
                    iterator.remove();
                }
            }
        }
    }

    static enum TabletGoalState {
        HOSTED(TUnloadTabletGoal.UNKNOWN), UNASSIGNED(TUnloadTabletGoal.UNASSIGNED),
        DELETED(TUnloadTabletGoal.DELETED), SUSPENDED(TUnloadTabletGoal.SUSPENDED);

        private final TUnloadTabletGoal unloadGoal;

        TabletGoalState(TUnloadTabletGoal unloadGoal) {
            this.unloadGoal = unloadGoal;
        }

        /** The purpose of unloading this tablet. */
        public TUnloadTabletGoal howUnload() {
            return unloadGoal;
        }
    };

    TabletGoalState getSystemGoalState(TabletLocationState tls) {
        switch (getMasterState()) {
        case NORMAL:
            return TabletGoalState.HOSTED;
        case HAVE_LOCK: // fall-through intended
        case INITIAL: // fall-through intended
        case SAFE_MODE:
            if (tls.extent.isMeta())
                return TabletGoalState.HOSTED;
            return TabletGoalState.UNASSIGNED;
        case UNLOAD_METADATA_TABLETS:
            if (tls.extent.isRootTablet())
                return TabletGoalState.HOSTED;
            return TabletGoalState.UNASSIGNED;
        case UNLOAD_ROOT_TABLET:
            return TabletGoalState.UNASSIGNED;
        case STOP:
            return TabletGoalState.UNASSIGNED;
        default:
            throw new IllegalStateException("Unknown Master State");
        }
    }

    TabletGoalState getTableGoalState(KeyExtent extent) {
        TableState tableState = TableManager.getInstance().getTableState(extent.getTableId());
        if (tableState == null)
            return TabletGoalState.DELETED;
        switch (tableState) {
        case DELETING:
            return TabletGoalState.DELETED;
        case OFFLINE:
        case NEW:
            return TabletGoalState.UNASSIGNED;
        default:
            return TabletGoalState.HOSTED;
        }
    }

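    // Decide what should happen to a single tablet by combining the master's own state, pending
    // server shutdowns, any in-progress merge that overlaps the tablet, the table's state, and
    // outstanding migrations.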
    TabletGoalState getGoalState(TabletLocationState tls, MergeInfo mergeInfo) {
        KeyExtent extent = tls.extent;
        // Shutting down?
        TabletGoalState state = getSystemGoalState(tls);
        if (state == TabletGoalState.HOSTED) {
            if (tls.current != null && serversToShutdown.contains(tls.current)) {
                return TabletGoalState.SUSPENDED;
            }
            // Handle merge transitions
            if (mergeInfo.getExtent() != null) {
                log.debug("mergeInfo overlaps: " + extent + " " + mergeInfo.overlaps(extent));
                if (mergeInfo.overlaps(extent)) {
                    switch (mergeInfo.getState()) {
                    case NONE:
                    case COMPLETE:
                        break;
                    case STARTED:
                    case SPLITTING:
                        return TabletGoalState.HOSTED;
                    case WAITING_FOR_CHOPPED:
                        if (tls.getState(tserverSet.getCurrentServers()).equals(TabletState.HOSTED)) {
                            if (tls.chopped)
                                return TabletGoalState.UNASSIGNED;
                        } else {
                            if (tls.chopped && tls.walogs.isEmpty())
                                return TabletGoalState.UNASSIGNED;
                        }

                        return TabletGoalState.HOSTED;
                    case WAITING_FOR_OFFLINE:
                    case MERGING:
                        return TabletGoalState.UNASSIGNED;
                    }
                }
            }

            // taking table offline?
            state = getTableGoalState(extent);
            if (state == TabletGoalState.HOSTED) {
                // Maybe this tablet needs to be migrated
                TServerInstance dest = migrations.get(extent);
                if (dest != null && tls.current != null && !dest.equals(tls.current)) {
                    return TabletGoalState.UNASSIGNED;
                }
            }
        }
        return state;
    }

    private class MigrationCleanupThread extends Daemon {

        @Override
        public void run() {
            setName("Migration Cleanup Thread");
            while (stillMaster()) {
                if (!migrations.isEmpty()) {
                    try {
                        cleanupOfflineMigrations();
                        cleanupNonexistentMigrations(getConnector());
                    } catch (Exception ex) {
                        log.error("Error cleaning up migrations", ex);
                    }
                }
                sleepUninterruptibly(TIME_BETWEEN_MIGRATION_CLEANUPS, TimeUnit.MILLISECONDS);
            }
        }

        /**
         * If a migrating tablet splits, and the tablet dies before sending the master a message, the migration will refer to a non-existing tablet, so it can never
         * complete. Periodically scan the metadata table and remove any migrating tablets that no longer exist.
         */
        private void cleanupNonexistentMigrations(final Connector connector)
                throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
            Scanner scanner = connector.createScanner(MetadataTable.NAME, Authorizations.EMPTY);
            TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.fetch(scanner);
            Set<KeyExtent> found = new HashSet<>();
            for (Entry<Key, Value> entry : scanner) {
                KeyExtent extent = new KeyExtent(entry.getKey().getRow(), entry.getValue());
                if (migrations.containsKey(extent)) {
                    found.add(extent);
                }
            }
            migrations.keySet().retainAll(found);
        }

        /**
         * If migrating a tablet for a table that is offline, the migration can never succeed because no tablet server will load the tablet. Check for offline
         * tables and remove their migrations.
         */
        private void cleanupOfflineMigrations() {
            TableManager manager = TableManager.getInstance();
            for (String tableId : Tables.getIdToNameMap(getInstance()).keySet()) {
                TableState state = manager.getTableState(tableId);
                if (TableState.OFFLINE == state) {
                    clearMigrations(tableId);
                }
            }
        }
    }

    private class StatusThread extends Daemon {

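        // Have all the relevant TabletGroupWatchers caught up with the current master state?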
        private boolean goodStats() {
            int start;
            switch (getMasterState()) {
            case UNLOAD_METADATA_TABLETS:
                start = 1;
                break;
            case UNLOAD_ROOT_TABLET:
                start = 2;
                break;
            default:
                start = 0;
            }
            for (int i = start; i < watchers.size(); i++) {
                TabletGroupWatcher watcher = watchers.get(i);
                if (watcher.stats.getLastMasterState() != getMasterState()) {
                    log.debug(watcher.getName() + ": " + watcher.stats.getLastMasterState() + " != "
                            + getMasterState());
                    return false;
                }
            }
            return true;
        }

        @Override
        public void run() {
            setName("Status Thread");
            EventCoordinator.Listener eventListener = nextEvent.getListener();
            while (stillMaster()) {
                long wait = DEFAULT_WAIT_FOR_WATCHER;
                try {
                    switch (getMasterGoalState()) {
                    case NORMAL:
                        setMasterState(MasterState.NORMAL);
                        break;
                    case SAFE_MODE:
                        if (getMasterState() == MasterState.NORMAL) {
                            setMasterState(MasterState.SAFE_MODE);
                        }
                        if (getMasterState() == MasterState.HAVE_LOCK) {
                            setMasterState(MasterState.SAFE_MODE);
                        }
                        break;
                    case CLEAN_STOP:
                        switch (getMasterState()) {
                        case NORMAL:
                            setMasterState(MasterState.SAFE_MODE);
                            break;
                        case SAFE_MODE: {
                            int count = nonMetaDataTabletsAssignedOrHosted();
                            log.debug(String.format("There are %d non-metadata tablets assigned or hosted", count));
                            if (count == 0 && goodStats())
                                setMasterState(MasterState.UNLOAD_METADATA_TABLETS);
                        }
                            break;
                        case UNLOAD_METADATA_TABLETS: {
                            int count = assignedOrHosted(MetadataTable.ID);
                            log.debug(String.format("There are %d metadata tablets assigned or hosted", count));
                            if (count == 0 && goodStats())
                                setMasterState(MasterState.UNLOAD_ROOT_TABLET);
                        }
                            break;
                        case UNLOAD_ROOT_TABLET: {
                            int count = assignedOrHosted(MetadataTable.ID);
                            if (count > 0 && goodStats()) {
                                log.debug(String.format("%d metadata tablets online", count));
                                setMasterState(MasterState.UNLOAD_ROOT_TABLET);
                            }
                            int root_count = assignedOrHosted(RootTable.ID);
                            if (root_count > 0 && goodStats())
                                log.debug("The root tablet is still assigned or hosted");
                            if (count + root_count == 0 && goodStats()) {
                                Set<TServerInstance> currentServers = tserverSet.getCurrentServers();
                                log.debug("stopping " + currentServers.size() + " tablet servers");
                                for (TServerInstance server : currentServers) {
                                    try {
                                        serversToShutdown.add(server);
                                        tserverSet.getConnection(server).fastHalt(masterLock);
                                    } catch (TException e) {
                                        // it's probably down, and we don't care
                                    } finally {
                                        tserverSet.remove(server);
                                    }
                                }
                                if (currentServers.size() == 0)
                                    setMasterState(MasterState.STOP);
                            }
                        }
                            break;
                        default:
                            break;
                        }
                    }
                } catch (Throwable t) {
                    log.error(
                            "Error occurred reading / switching master goal state. Will continue with attempt to update status",
                            t);
                }

                try {
                    wait = updateStatus();
                    eventListener.waitForEvents(wait);
                } catch (Throwable t) {
                    log.error("Error balancing tablets, will wait for " + WAIT_BETWEEN_ERRORS / ONE_SECOND
                            + " (seconds) and then retry", t);
                    sleepUninterruptibly(WAIT_BETWEEN_ERRORS, TimeUnit.MILLISECONDS);
                }
            }
        }

        private long updateStatus() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
            Set<TServerInstance> currentServers = tserverSet.getCurrentServers();
            tserverStatus = Collections.synchronizedSortedMap(gatherTableInformation(currentServers));
            checkForHeldServer(tserverStatus);

            if (!badServers.isEmpty()) {
                log.debug("not balancing because the balance information is out-of-date " + badServers.keySet());
            } else if (notHosted() > 0) {
                log.debug("not balancing because there are unhosted tablets: " + notHosted());
            } else if (getMasterGoalState() == MasterGoalState.CLEAN_STOP) {
                log.debug("not balancing because the master is attempting to stop cleanly");
            } else if (!serversToShutdown.isEmpty()) {
                log.debug("not balancing while shutting down servers " + serversToShutdown);
            } else {
                for (TabletGroupWatcher tgw : watchers) {
                    if (!tgw.isSameTserversAsLastScan(currentServers)) {
                        log.debug("not balancing just yet, as collection of live tservers is in flux");
                        return DEFAULT_WAIT_FOR_WATCHER;
                    }
                }
                return balanceTablets();
            }
            return DEFAULT_WAIT_FOR_WATCHER;
        }

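        // If exactly one tablet server has been holding commits longer than TSERV_HOLD_TIME_SUICIDE
        // while every other server looks healthy, ask that server to halt so it cannot stall the
        // whole cluster.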
        private void checkForHeldServer(SortedMap<TServerInstance, TabletServerStatus> tserverStatus) {
            TServerInstance instance = null;
            int crazyHoldTime = 0;
            int someHoldTime = 0;
            final long maxWait = getConfiguration().getTimeInMillis(Property.TSERV_HOLD_TIME_SUICIDE);
            for (Entry<TServerInstance, TabletServerStatus> entry : tserverStatus.entrySet()) {
                if (entry.getValue().getHoldTime() > 0) {
                    someHoldTime++;
                    if (entry.getValue().getHoldTime() > maxWait) {
                        instance = entry.getKey();
                        crazyHoldTime++;
                    }
                }
            }
            if (crazyHoldTime == 1 && someHoldTime == 1 && tserverStatus.size() > 1) {
                log.warn("Tablet server " + instance + " exceeded maximum hold time: attempting to kill it");
                try {
                    TServerConnection connection = tserverSet.getConnection(instance);
                    if (connection != null)
                        connection.fastHalt(masterLock);
                } catch (TException e) {
                    log.error("{}", e.getMessage(), e);
                }
                tserverSet.remove(instance);
            }
        }

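        // Ask the configured TabletBalancer for migrations, record any that are not already
        // pending, and notify waiters on balancedNotifier when the balancer has nothing to move.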
        private long balanceTablets() {
            List<TabletMigration> migrationsOut = new ArrayList<>();
            long wait = tabletBalancer.balance(Collections.unmodifiableSortedMap(tserverStatus),
                    migrationsSnapshot(), migrationsOut);

            for (TabletMigration m : TabletBalancer.checkMigrationSanity(tserverStatus.keySet(), migrationsOut)) {
                if (migrations.containsKey(m.tablet)) {
                    log.warn("balancer requested migration more than once, skipping " + m);
                    continue;
                }
                migrations.put(m.tablet, m.newServer);
                log.debug("migration " + m);
            }
            if (migrationsOut.size() > 0) {
                nextEvent.event("Migrating %d more tablets, %d total", migrationsOut.size(), migrations.size());
            } else {
                synchronized (balancedNotifier) {
                    balancedNotifier.notifyAll();
                }
            }
            return wait;
        }

    }

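    // Poll every live tablet server for its status in parallel; a server that fails to respond
    // more than MAX_BAD_STATUS_COUNT times in a row is asked to halt.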
    private SortedMap<TServerInstance, TabletServerStatus> gatherTableInformation(
            Set<TServerInstance> currentServers) {
        long start = System.currentTimeMillis();
        int threads = Math.max(getConfiguration().getCount(Property.MASTER_STATUS_THREAD_POOL_SIZE), 1);
        ExecutorService tp = Executors.newFixedThreadPool(threads);
        final SortedMap<TServerInstance, TabletServerStatus> result = new TreeMap<>();
        for (TServerInstance serverInstance : currentServers) {
            final TServerInstance server = serverInstance;
            tp.submit(new Runnable() {
                @Override
                public void run() {
                    try {
                        Thread t = Thread.currentThread();
                        String oldName = t.getName();
                        try {
                            t.setName("Getting status from " + server);
                            TServerConnection connection = tserverSet.getConnection(server);
                            if (connection == null)
                                throw new IOException("No connection to " + server);
                            TabletServerStatus status = connection.getTableMap(false);
                            result.put(server, status);
                        } finally {
                            t.setName(oldName);
                        }
                    } catch (Exception ex) {
                        log.error("unable to get tablet server status " + server + " " + ex.toString());
                        log.debug("unable to get tablet server status " + server, ex);
                        if (badServers.get(server).incrementAndGet() > MAX_BAD_STATUS_COUNT) {
                            log.warn("attempting to stop " + server);
                            try {
                                TServerConnection connection = tserverSet.getConnection(server);
                                if (connection != null) {
                                    connection.halt(masterLock);
                                }
                            } catch (TTransportException e) {
                                // ignore: it's probably down
                            } catch (Exception e) {
                                log.info("error talking to troublesome tablet server ", e);
                            }
                            badServers.remove(server);
                        }
                    }
                }
            });
        }
        tp.shutdown();
        try {
            tp.awaitTermination(getConfiguration().getTimeInMillis(Property.TSERV_CLIENT_TIMEOUT) * 2,
                    TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            log.debug("Interrupted while fetching status");
        }
        synchronized (badServers) {
            badServers.keySet().retainAll(currentServers);
            badServers.keySet().removeAll(result.keySet());
        }
        log.debug(String.format("Finished gathering information from %d servers in %.2f seconds", result.size(),
                (System.currentTimeMillis() - start) / 1000.));
        return result;
    }
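    // gatherTableInformation() above fans status requests out over a bounded thread pool, then gives
    // the whole batch at most twice the tserver client timeout to complete; servers that do not
    // respond in time are simply absent from the result and accumulate strikes in badServers. The
    // shape of the pattern, in outline (illustrative only; fetchStatus is a hypothetical helper):
    //
    //     ExecutorService pool = Executors.newFixedThreadPool(threads);
    //     for (TServerInstance server : currentServers)
    //         pool.submit(() -> result.put(server, fetchStatus(server)));
    //     pool.shutdown();                                        // stop accepting new tasks
    //     pool.awaitTermination(timeout, TimeUnit.MILLISECONDS);  // bounded wait for stragglers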

    public void run() throws IOException, InterruptedException, KeeperException {
        final String zroot = ZooUtil.getRoot(getInstance());

        // ACCUMULO-4424 Put up the Thrift servers before getting the lock as a sign of process health when running as a hot-standby
        //
        // Start the Master's Client service
        clientHandler = new MasterClientServiceHandler(this);
        // Ensure that calls before the master gets the lock fail
        Iface haProxy = HighlyAvailableServiceWrapper.service(clientHandler, this);
        Iface rpcProxy = RpcWrapper.service(haProxy, new Processor<Iface>(clientHandler));
        final Processor<Iface> processor;
        if (ThriftServerType.SASL == getThriftServerType()) {
            Iface tcredsProxy = TCredentialsUpdatingWrapper.service(rpcProxy, clientHandler.getClass(),
                    getConfiguration());
            processor = new Processor<>(tcredsProxy);
        } else {
            processor = new Processor<>(rpcProxy);
        }
        ServerAddress sa = TServerUtils.startServer(this, hostname, Property.MASTER_CLIENTPORT, processor, "Master",
                "Master Client Service Handler", null, Property.MASTER_MINTHREADS, Property.MASTER_THREADCHECK,
                Property.GENERAL_MAX_MESSAGE_SIZE);
        clientService = sa.server;
        log.info("Started Master client service at {}", sa.address);

        // Start the replication coordinator which assigns tservers to service replication requests
        MasterReplicationCoordinator impl = new MasterReplicationCoordinator(this);
        ReplicationCoordinator.Iface haReplicationProxy = HighlyAvailableServiceWrapper.service(impl, this);
        ReplicationCoordinator.Processor<ReplicationCoordinator.Iface> replicationCoordinatorProcessor = new ReplicationCoordinator.Processor<>(
                RpcWrapper.service(impl, new ReplicationCoordinator.Processor<>(haReplicationProxy)));
        ServerAddress replAddress = TServerUtils.startServer(this, hostname,
                Property.MASTER_REPLICATION_COORDINATOR_PORT, replicationCoordinatorProcessor,
                "Master Replication Coordinator", "Replication Coordinator", null,
                Property.MASTER_REPLICATION_COORDINATOR_MINTHREADS,
                Property.MASTER_REPLICATION_COORDINATOR_THREADCHECK, Property.GENERAL_MAX_MESSAGE_SIZE);

        log.info("Started replication coordinator service at " + replAddress.address);

        // block until we can obtain the ZK lock for the master
        getMasterLock(zroot + Constants.ZMASTER_LOCK);

        recoveryManager = new RecoveryManager(this);

        TableManager.getInstance().addObserver(this);

        StatusThread statusThread = new StatusThread();
        statusThread.start();

        MigrationCleanupThread migrationCleanupThread = new MigrationCleanupThread();
        migrationCleanupThread.start();

        tserverSet.startListeningForTabletServerChanges();

        ZooReaderWriter zReaderWriter = ZooReaderWriter.getInstance();

        zReaderWriter.getChildren(zroot + Constants.ZRECOVERY, new Watcher() {
            @Override
            public void process(WatchedEvent event) {
                nextEvent.event("Noticed recovery changes", event.getType());
                try {
                    // watcher only fires once, add it back
                    ZooReaderWriter.getInstance().getChildren(zroot + Constants.ZRECOVERY, this);
                } catch (Exception e) {
                    log.error("Failed to add log recovery watcher back", e);
                }
            }
        });
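        // ZooKeeper watches are one-shot: the callback above re-registers itself on every delivery
        // so that later changes under ZRECOVERY keep producing "Noticed recovery changes" events.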

        watchers.add(new TabletGroupWatcher(this, new MetaDataStateStore(this, this), null) {
            @Override
            boolean canSuspendTablets() {
                // Always allow user data tablets to enter suspended state.
                return true;
            }
        });

        watchers.add(new TabletGroupWatcher(this, new RootTabletStateStore(this, this), watchers.get(0)) {
            @Override
            boolean canSuspendTablets() {
                // Allow metadata tablets to enter suspended state only if so configured. Generally we'll want metadata tablets to
                // be immediately reassigned, even if there's a global table.suspension.duration setting.
                return getConfiguration().getBoolean(Property.MASTER_METADATA_SUSPENDABLE);
            }
        });

        watchers.add(new TabletGroupWatcher(this, new ZooTabletStateStore(new ZooStore(zroot)), watchers.get(1)) {
            @Override
            boolean canSuspendTablets() {
                // Never allow root tablet to enter suspended state.
                return false;
            }
        });
        for (TabletGroupWatcher watcher : watchers) {
            watcher.start();
        }
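        // The three TabletGroupWatchers form a dependency chain: the first watches user tablets
        // (state kept in the metadata table), the second watches metadata tablets (state kept in the
        // root tablet) and depends on the first, and the third watches the root tablet (state kept in
        // ZooKeeper) and depends on the second. Only the user-tablet watcher may always suspend tablets.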

        // Once we are sure the metadata upgrade is complete, we can safely allow FATE use.
        waitForMetadataUpgrade.await();

        try {
            final AgeOffStore<Master> store = new AgeOffStore<>(
                    new org.apache.accumulo.fate.ZooStore<Master>(ZooUtil.getRoot(getInstance()) + Constants.ZFATE,
                            ZooReaderWriter.getInstance()),
                    1000 * 60 * 60 * 8);

            int threads = getConfiguration().getCount(Property.MASTER_FATE_THREADPOOL_SIZE);

            fate = new Fate<>(this, store);
            fate.startTransactionRunners(threads);

            SimpleTimer.getInstance(getConfiguration()).schedule(new Runnable() {

                @Override
                public void run() {
                    store.ageOff();
                }
            }, 63000, 63000);
        } catch (KeeperException e) {
            throw new IOException(e);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
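        // FATE transactions live in ZooKeeper under ZFATE; the AgeOffStore wrapper ages out old
        // transactions after 1000 * 60 * 60 * 8 ms = 8 hours, and the SimpleTimer task above triggers
        // that age-off check every 63 seconds.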

        ZooKeeperInitialization.ensureZooKeeperInitialized(zReaderWriter, zroot);

        // Make sure that we have a secret key (either a new one or an old one from ZK) before we
        // advertise this master's address in the lock data and start processing client requests.
        if (null != authenticationTokenKeyManager && null != keyDistributor) {
            log.info("Starting delegation-token key manager");
            keyDistributor.initialize();
            authenticationTokenKeyManager.start();
            boolean logged = false;
            while (!authenticationTokenKeyManager.isInitialized()) {
                // Print out a status message when we start waiting for the key manager to get initialized
                if (!logged) {
                    log.info("Waiting for AuthenticationTokenKeyManager to be initialized");
                    logged = true;
                }
                sleepUninterruptibly(200, TimeUnit.MILLISECONDS);
            }
            // And log when we are initialized
            log.info("AuthenticationTokenSecretManager is initialized");
        }

        String address = sa.address.toString();
        log.info("Setting master lock data to " + address);
        masterLock.replaceLockData(address.getBytes(UTF_8));

        while (!clientService.isServing()) {
            sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
        }

        // Start the daemon to scan the replication table and make units of work
        replicationWorkDriver = new ReplicationDriver(this);
        replicationWorkDriver.start();

        // Start the daemon to assign work to tservers to replicate to our peers
        try {
            replicationWorkAssigner = new WorkDriver(this);
        } catch (AccumuloException | AccumuloSecurityException e) {
            log.error("Caught exception trying to initialize replication WorkDriver", e);
            throw new RuntimeException(e);
        }
        replicationWorkAssigner.start();

        // Advertise the address we used so peers don't have to be told what it is
        ZooReaderWriter.getInstance().putPersistentData(
                ZooUtil.getRoot(getInstance()) + Constants.ZMASTER_REPLICATION_COORDINATOR_ADDR,
                replAddress.address.toString().getBytes(UTF_8), NodeExistsPolicy.OVERWRITE);

        // Register replication metrics
        MasterMetricsFactory factory = new MasterMetricsFactory(getConfiguration(), this);
        Metrics replicationMetrics = factory.createReplicationMetrics();
        try {
            replicationMetrics.register();
        } catch (Exception e) {
            log.error("Failed to register replication metrics", e);
        }

        while (clientService.isServing()) {
            sleepUninterruptibly(500, TimeUnit.MILLISECONDS);
        }
        log.info("Shutting down fate.");
        fate.shutdown();

        log.info("Shutting down timekeeping.");
        timeKeeper.shutdown();

        final long deadline = System.currentTimeMillis() + MAX_CLEANUP_WAIT_TIME;
        statusThread.join(remaining(deadline));
        replicationWorkAssigner.join(remaining(deadline));
        replicationWorkDriver.join(remaining(deadline));
        replAddress.server.stop();
        // Signal that we want it to stop, and wait for it to do so.
        if (authenticationTokenKeyManager != null) {
            authenticationTokenKeyManager.gracefulStop();
            authenticationTokenKeyManager.join(remaining(deadline));
        }

        // quit, even if the tablet servers somehow jam up and the watchers
        // don't stop
        for (TabletGroupWatcher watcher : watchers) {
            watcher.join(remaining(deadline));
        }
        log.info("exiting");
    }

    private long remaining(long deadline) {
        return Math.max(1, deadline - System.currentTimeMillis());
    }
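    // remaining() turns one absolute deadline into a per-join budget, so the whole shutdown sequence
    // in run() is bounded by MAX_CLEANUP_WAIT_TIME no matter how many threads are joined; the
    // Math.max(1, ...) floor keeps join() from being handed 0, which would mean "wait forever".
    // Usage shape (illustrative):
    //
    //     long deadline = System.currentTimeMillis() + MAX_CLEANUP_WAIT_TIME;
    //     statusThread.join(remaining(deadline));            // may use up to the full budget
    //     replicationWorkDriver.join(remaining(deadline));   // only gets whatever budget is left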

    public ZooLock getMasterLock() {
        return masterLock;
    }

    private static class MasterLockWatcher implements ZooLock.AsyncLockWatcher {

        boolean acquiredLock = false;
        boolean failedToAcquireLock = false;

        @Override
        public void lostLock(LockLossReason reason) {
            Halt.halt("Master lock in zookeeper lost (reason = " + reason + "), exiting!", -1);
        }

        @Override
        public void unableToMonitorLockNode(final Throwable e) {
            // ACCUMULO-3651 Changed level to error and added FATAL to message for slf4j compatibility
            Halt.halt(-1, new Runnable() {
                @Override
                public void run() {
                    log.error("FATAL: No longer able to monitor master lock node", e);
                }
            });

        }

        @Override
        public synchronized void acquiredLock() {
            log.debug("Acquired master lock");

            if (acquiredLock || failedToAcquireLock) {
                Halt.halt("Zoolock in unexpected state AL " + acquiredLock + " " + failedToAcquireLock, -1);
            }

            acquiredLock = true;
            notifyAll();
        }

        @Override
        public synchronized void failedToAcquireLock(Exception e) {
            log.warn("Failed to get master lock " + e);

            if (e instanceof NoAuthException) {
                String msg = "Failed to acquire master lock due to incorrect ZooKeeper authentication.";
                log.error(msg + " Ensure instance.secret is consistent across Accumulo configuration", e);
                Halt.halt(msg, -1);
            }

            if (acquiredLock) {
                Halt.halt("Zoolock in unexpected state FAL " + acquiredLock + " " + failedToAcquireLock, -1);
            }

            failedToAcquireLock = true;
            notifyAll();
        }

        public synchronized void waitForChange() {
            while (!acquiredLock && !failedToAcquireLock) {
                try {
                    wait();
                } catch (InterruptedException e) {
                    // ignore the interrupt and keep waiting for a definitive lock outcome
                }
            }
        }
    }
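    // MasterLockWatcher is a small state machine: exactly one of acquiredLock or failedToAcquireLock
    // flips to true (any other combination halts the process), and waitForChange() parks the caller
    // until that happens. getMasterLock() below drives it in a retry loop.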

    private void getMasterLock(final String zMasterLoc) throws KeeperException, InterruptedException {
        log.info("trying to get master lock");

        final String masterClientAddress = hostname + ":"
                + getConfiguration().getPort(Property.MASTER_CLIENTPORT)[0];

        while (true) {

            MasterLockWatcher masterLockWatcher = new MasterLockWatcher();
            masterLock = new ZooLock(zMasterLoc);
            masterLock.lockAsync(masterLockWatcher, masterClientAddress.getBytes(UTF_8));

            masterLockWatcher.waitForChange();

            if (masterLockWatcher.acquiredLock) {
                break;
            }

            if (!masterLockWatcher.failedToAcquireLock) {
                throw new IllegalStateException("master lock in unknown state");
            }

            masterLock.tryToCancelAsyncLockOrUnlock();

            sleepUninterruptibly(TIME_TO_WAIT_BETWEEN_LOCK_CHECKS, TimeUnit.MILLISECONDS);
        }

        setMasterState(MasterState.HAVE_LOCK);
    }
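    // The loop above retries indefinitely: request the lock asynchronously, wait for a definitive
    // outcome, and on failure cancel the attempt and sleep TIME_TO_WAIT_BETWEEN_LOCK_CHECKS before
    // trying again. Only once the lock is actually held does the master transition to HAVE_LOCK.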

    public static void main(String[] args) throws Exception {
        try {
            final String app = "master";
            Accumulo.setupLogging(app);
            SecurityUtil.serverLogin(SiteConfiguration.getInstance());

            ServerOpts opts = new ServerOpts();
            opts.parseArgs(app, args);
            String hostname = opts.getAddress();
            ServerConfigurationFactory conf = new ServerConfigurationFactory(HdfsZooInstance.getInstance());
            VolumeManager fs = VolumeManagerImpl.get();
            Accumulo.init(fs, conf, app);
            Master master = new Master(conf, fs, hostname);
            DistributedTrace.enable(hostname, app, conf.getConfiguration());
            master.run();
        } catch (Exception ex) {
            log.error("Unexpected exception, exiting", ex);
            System.exit(1);
        } finally {
            DistributedTrace.disable();
        }
    }
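    // main() is normally reached through the launcher script rather than invoked directly, e.g.
    // (assuming a standard installation layout):
    //
    //     $ACCUMULO_HOME/bin/accumulo master
    //
    // Startup order matters: logging and the server (Kerberos) login come first, then configuration
    // and the volume manager, and only then is the Master constructed and run.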

    @Override
    public void update(LiveTServerSet current, Set<TServerInstance> deleted, Set<TServerInstance> added) {
        DeadServerList obit = new DeadServerList(ZooUtil.getRoot(getInstance()) + Constants.ZDEADTSERVERS);
        if (added.size() > 0) {
            log.info("New servers: " + added);
            for (TServerInstance up : added)
                obit.delete(up.hostPort());
        }
        for (TServerInstance dead : deleted) {
            String cause = "unexpected failure";
            if (serversToShutdown.contains(dead))
                cause = "clean shutdown"; // maybe an incorrect assumption
            if (!getMasterGoalState().equals(MasterGoalState.CLEAN_STOP))
                obit.post(dead.hostPort(), cause);
        }

        Set<TServerInstance> unexpected = new HashSet<>(deleted);
        unexpected.removeAll(this.serversToShutdown);
        if (unexpected.size() > 0) {
            if (stillMaster() && !getMasterGoalState().equals(MasterGoalState.CLEAN_STOP)) {
                log.warn("Lost servers " + unexpected);
            }
        }
        serversToShutdown.removeAll(deleted);
        badServers.keySet().removeAll(deleted);
        // clear out any bad server with the same host/port as a new server
        synchronized (badServers) {
            cleanListByHostAndPort(badServers.keySet(), deleted, added);
        }
        synchronized (serversToShutdown) {
            cleanListByHostAndPort(serversToShutdown, deleted, added);
        }

        synchronized (migrations) {
            Iterator<Entry<KeyExtent, TServerInstance>> iter = migrations.entrySet().iterator();
            while (iter.hasNext()) {
                Entry<KeyExtent, TServerInstance> entry = iter.next();
                if (deleted.contains(entry.getValue())) {
                    log.info("Canceling migration of " + entry.getKey() + " to " + entry.getValue());
                    iter.remove();
                }
            }
        }
        nextEvent.event("There are now %d tablet servers", current.size());
    }
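    // update() is the LiveTServerSet callback: newly seen servers have any stale obituary cleared,
    // servers that disappeared get an obituary posted (unless the goal state is CLEAN_STOP),
    // migrations targeting a dead server are cancelled, and the bad-server / shutdown bookkeeping is
    // pruned so a restarted server on the same host:port starts with a clean slate.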

    private static void cleanListByHostAndPort(Collection<TServerInstance> badServers, Set<TServerInstance> deleted,
            Set<TServerInstance> added) {
        Iterator<TServerInstance> badIter = badServers.iterator();
        while (badIter.hasNext()) {
            TServerInstance bad = badIter.next();
            for (TServerInstance add : added) {
                if (bad.hostPort().equals(add.hostPort())) {
                    badIter.remove();
                    break;
                }
            }
            for (TServerInstance del : deleted) {
                if (bad.hostPort().equals(del.hostPort())) {
                    badIter.remove();
                    break;
                }
            }
        }
    }

    @Override
    public void stateChanged(String tableId, TableState state) {
        nextEvent.event("Table state in zookeeper changed for %s to %s", tableId, state);
        if (TableState.OFFLINE == state) {
            clearMigrations(tableId);
        }
    }

    @Override
    public void initialize(Map<String, TableState> tableIdToStateMap) {
    }

    @Override
    public void sessionExpired() {
    }

    @Override
    public Set<String> onlineTables() {
        Set<String> result = new HashSet<>();
        if (getMasterState() != MasterState.NORMAL) {
            if (getMasterState() != MasterState.UNLOAD_METADATA_TABLETS)
                result.add(MetadataTable.ID);
            if (getMasterState() != MasterState.UNLOAD_ROOT_TABLET)
                result.add(RootTable.ID);
            return result;
        }
        TableManager manager = TableManager.getInstance();

        for (String tableId : Tables.getIdToNameMap(getInstance()).keySet()) {
            TableState state = manager.getTableState(tableId);
            if (state != null) {
                if (state == TableState.ONLINE)
                    result.add(tableId);
            }
        }
        return result;
    }
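    // During an orderly shutdown the master walks backwards through the table hierarchy: in any
    // state other than NORMAL only the metadata and root tables are reported online, and they in
    // turn drop off as the state reaches UNLOAD_METADATA_TABLETS and UNLOAD_ROOT_TABLET, so user
    // tablets drain first, then metadata tablets, then the root tablet.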

    @Override
    public Set<TServerInstance> onlineTabletServers() {
        return tserverSet.getCurrentServers();
    }

    @Override
    public Collection<MergeInfo> merges() {
        List<MergeInfo> result = new ArrayList<>();
        for (String tableId : Tables.getIdToNameMap(getInstance()).keySet()) {
            result.add(getMergeInfo(tableId));
        }
        return result;
    }

    // recovers state from the persistent transaction used to shut down a server
    public void shutdownTServer(TServerInstance server) {
        nextEvent.event("Tablet Server shutdown requested for %s", server);
        serversToShutdown.add(server);
    }

    public EventCoordinator getEventCoordinator() {
        return nextEvent;
    }

    public ServerConfigurationFactory getConfigurationFactory() {
        return serverConfig;
    }

    public VolumeManager getFileSystem() {
        return this.fs;
    }

    public void assignedTablet(KeyExtent extent) {
        if (extent.isMeta()) {
            if (getMasterState().equals(MasterState.UNLOAD_ROOT_TABLET)) {
                setMasterState(MasterState.UNLOAD_METADATA_TABLETS);
            }
        }
        if (extent.isRootTablet()) {
            // probably too late, but try anyhow
            if (getMasterState().equals(MasterState.STOP)) {
                setMasterState(MasterState.UNLOAD_ROOT_TABLET);
            }
        }
    }

    public void waitForBalance(TInfo tinfo) {
        synchronized (balancedNotifier) {
            long eventCounter;
            do {
                eventCounter = nextEvent.waitForEvents(0, 0);
                try {
                    balancedNotifier.wait();
                } catch (InterruptedException e) {
                    log.debug(e.toString(), e);
                }
            } while (displayUnassigned() > 0 || migrations.size() > 0
                    || eventCounter != nextEvent.waitForEvents(0, 0));
        }
    }
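    // waitForBalance() blocks until the cluster is quiescent: no unassigned tablets, no outstanding
    // migrations, and no new events since the wait began (the event counter is sampled before and
    // after each wait on balancedNotifier, which the balancing pass notifies once it has nothing
    // left to move).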

    public MasterMonitorInfo getMasterMonitorInfo() {
        final MasterMonitorInfo result = new MasterMonitorInfo();

        result.tServerInfo = new ArrayList<>();
        result.tableMap = new DefaultMap<>(new TableInfo());
        for (Entry<TServerInstance, TabletServerStatus> serverEntry : tserverStatus.entrySet()) {
            final TabletServerStatus status = serverEntry.getValue();
            result.tServerInfo.add(status);
            for (Entry<String, TableInfo> entry : status.tableMap.entrySet()) {
                TableInfoUtil.add(result.tableMap.get(entry.getKey()), entry.getValue());
            }
        }
        result.badTServers = new HashMap<>();
        synchronized (badServers) {
            for (TServerInstance bad : badServers.keySet()) {
                result.badTServers.put(bad.hostPort(), TabletServerState.UNRESPONSIVE.getId());
            }
        }
        result.state = getMasterState();
        result.goalState = getMasterGoalState();
        result.unassignedTablets = displayUnassigned();
        result.serversShuttingDown = new HashSet<>();
        synchronized (serversToShutdown) {
            for (TServerInstance server : serversToShutdown)
                result.serversShuttingDown.add(server.hostPort());
        }
        DeadServerList obit = new DeadServerList(ZooUtil.getRoot(getInstance()) + Constants.ZDEADTSERVERS);
        result.deadTabletServers = obit.getList();
        result.bulkImports = bulkImportStatus.getBulkLoadStatus();
        return result;
    }

    /**
     * Whether delegation tokens can be generated for users.
     */
    public boolean delegationTokensAvailable() {
        return delegationTokensAvailable;
    }

    @Override
    public Set<KeyExtent> migrationsSnapshot() {
        Set<KeyExtent> migrationKeys = new HashSet<>();
        synchronized (migrations) {
            migrationKeys.addAll(migrations.keySet());
        }
        return Collections.unmodifiableSet(migrationKeys);
    }
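    // migrationsSnapshot() hands callers an immutable copy taken under the migrations lock, so
    // iteration never races with the balancer adding or cancelling migrations. The same
    // copy-under-lock idiom is used for shutdownServers() below.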

    @Override
    public Set<TServerInstance> shutdownServers() {
        synchronized (serversToShutdown) {
            return new HashSet<>(serversToShutdown);
        }
    }

    public void markDeadServerLogsAsClosed(Map<TServerInstance, List<Path>> logsForDeadServers)
            throws WalMarkerException {
        WalStateManager mgr = new WalStateManager(this.inst, ZooReaderWriter.getInstance());
        for (Entry<TServerInstance, List<Path>> server : logsForDeadServers.entrySet()) {
            for (Path path : server.getValue()) {
                mgr.closeWal(server.getKey(), path);
            }
        }
    }

    public void updateBulkImportStatus(String directory, BulkImportState state) {
        bulkImportStatus.updateBulkImportStatus(Collections.singletonList(directory), state);
    }

    public void removeBulkImportStatus(String directory) {
        bulkImportStatus.removeBulkImportStatus(Collections.singletonList(directory));
    }

    /**
     * Returns how long (in milliseconds) a master has been overseeing this cluster. The returned value is an approximately monotonic clock that is roughly
     * consistent across different masters and across restarts of the same master.
     */
    public Long getSteadyTime() {
        return timeKeeper.getTime();
    }

    @Override
    public boolean isActiveService() {
        if (null != masterLock) {
            return masterLock.isLocked();
        }
        return false;
    }
}