org.apache.hadoop.hdfs.server.namenode.AvatarNode.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.namenode.AvatarNode.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import java.io.IOException;
import java.io.BufferedOutputStream;
import java.io.BufferedInputStream;
import java.io.DataOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileInputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.SocketTimeoutException;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;

import javax.management.NotCompliantMBeanException;
import javax.management.StandardMBean;

import org.apache.hadoop.ipc.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.ReconfigurationException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.hdfs.AvatarFailoverSnapshot;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.FastProtocolHDFS;
import org.apache.hadoop.hdfs.FastWritableHDFS;
import org.apache.hadoop.hdfs.FileStatusExtended;
import org.apache.hadoop.hdfs.OpenFilesInfo;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.util.FlushableLogger;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.hdfs.protocol.AvatarProtocol;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.Avatar;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.StartupOption;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.InstanceId;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.protocol.AvatarDatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockFlags;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockReport;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.IncrementalBlockReport;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.ReceivedBlockInfo;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.datanode.DatanodeProtocols;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.hdfs.server.namenode.ClusterJspHelper.NameNodeKey;
import org.apache.hadoop.hdfs.server.namenode.JournalStream.JournalType;
import org.apache.hadoop.hdfs.server.namenode.metrics.AvatarNodeMetrics;
import org.apache.hadoop.hdfs.server.namenode.metrics.AvatarNodeStatusMBean;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.hdfs.util.LightWeightBitSet;

/**
 * This is an implementation of the AvatarNode, a hot
 * standby for the NameNode.
 * It supports fast, manually triggered failover between the two instances.
 * The AvatarNode has two avatars.. the Standby avatar and the Active
 * avatar.
 * 
 * In the Standby avatar, the AvatarNode is consuming transaction logs
 * generated by the primary (via a transaction log stored in a shared device).
 * Typically, the primary Namenode is writing transactions to a NFS filesystem
 * and the Standby is reading the log from the same NFS filesystem. The 
 * Standby is also making periodic checkpoints to the primary namenode.
 * 
 * A manual command can switch the AvatarNode from the Standby avatar
 * to the Active avatar. In the Active avatar, the AvatarNode performs precisely
 * the same functionality as a real usual Namenode. The switching from 
 * Standby avatar to the Active avatar is fast and can typically occur 
 * within seconds.
 *
 * Typically, an administrator will require two shared mount points for
 * transaction logs. They have to be set in fs.name.dir.shared0 and
 * fs.name.dir.shared1 (similarly for edits). Then the administrator starts
 * the AvatarNode on two different machines as follows:
 *
 * bin/hadoop org.apache.hadoop.hdfs.server.namenode.AvatarNode -zero -active
 * bin/hadoop org.apache.hadoop.hdfs.server.namenode.AvatarNode -one -standby
 * The first AvatarNode uses fs.name.dir.shared0 while the second
 * AvatarNode uses fs.name.dir.shared1 to write its transaction logs.
 * Also, at startup, the first instance is the primary Namenode and the
 * second instance is the Standby.
 *
 * After a while, the administrator decides to change the avatar of the
 * second instance to Active. In this case, he/she has to first ensure that the
 * first instance is truly dead. This code does not handle the
 * split-brain scenario where there are two active namenodes in one cluster.
 *
 */

public class AvatarNode extends NameNode implements AvatarProtocol, AvatarNodeStatusMBean {

    static {
        // Make the avatar-specific configuration resources visible to every
        // Configuration object created after this class is loaded.
        Configuration.addDefaultResource("avatar-default.xml");
        Configuration.addDefaultResource("avatar-site.xml");
    }

    public static final Log LOG = LogFactory.getLog(AvatarNode.class.getName());
    // immediate flush logger
    private static final Log FLOG = FlushableLogger.getLogger(LOG);

    // cleanup interval in milliseconds (60 s); presumably used by the
    // Cleaner thread, whose body is defined elsewhere — TODO confirm
    private static final int INVALIDATES_CLEANUP_INTERVAL = 60 * 1000;

    // name of the snapshot file written/read by the failover test framework
    public static final String FAILOVER_SNAPSHOT_FILE = "failover_snapshot_file";

    // The instanceId is assigned at startup time and does not change for
    // the lifetime of the Node. The administrator has to name each instance
    // of the AvatarNode with a different instanceId. The node number is used
    // by the AvatarNode to determine which shared devices it should use to
    // checkpoint the image.
    //
    private InstanceId instance;

    // The txid the fsimage was sync-ed from the remote AvatarNode
    volatile private long startCheckpointTxId;

    /** RPC server (AvatarProtocol) */
    private Server server;
    /** RPC server address */
    private InetSocketAddress serverAddress;
    private volatile Avatar currentAvatar; // the current incarnation of this node
    private Standby standby; // the standby object
    private Configuration confg; // config for the standby namenode
    private Configuration startupConf; // config for the namenode
    private Thread standbyThread; // the standby daemon thread
    private Cleaner cleaner; // The thread cleaning up invalidates and mis-replicated blocks
    private Thread cleanerThread; // thread running the Cleaner

    private RunInfo runInfo; // run-state flags shared with startup code
    private long sessionId; // session id; written to zookeeper by the primary at shutdown

    private StandbySafeMode standbySafeMode; // set only when running as the standby avatar
    private volatile boolean isInitialized = false; // set at the end of the constructor

    protected final boolean enableTestFramework; // failover test framework enabled?
    protected final boolean enableTestFrameworkFsck; // also run fsck for the test framework?

    private String failoverFsck = ""; // fsck output of this node at failover time (test framework)
    private String oldPrimaryFsck = ""; // fsck output read from the old primary's snapshot
    private volatile FailoverState failoverState = FailoverState.BEFORE_FAILOVER; // failover state machine

    private final AvatarNodeMetrics metrics; // wrapper around the NameNode metrics

    /**
     * States of the failover procedure; {@link #toString()} returns the
     * human-readable name used in logs and status reporting.
     * (Nested enums are implicitly static; modifier order fixed to be
     * conventional, and the name field made final since it never changes.)
     */
    public enum FailoverState {
        BEFORE_FAILOVER("BeforeFailover"), START_FAILOVER("StartFailover"), FAILED_FAILOVER(
                "FailedFailover"), AWAIT_FAILOVER("AwaitFailover"), PERFORM_FAILOVER("PerformFailover");

        /** Display name reported by toString(). */
        private final String name;

        private FailoverState(String arg) {
            this.name = arg;
        }

        @Override
        public String toString() {
            return name;
        }
    }

    /**
     * Constructs an AvatarNode wrapping the encapsulated NameNode.
     *
     * The startup Conf is the original configuration of the AvatarNode. It is used by the
     * secondary namenode to talk to the primary namenode.
     * The conf is the modified configuration that is used by the standby namenode.
     *
     * @param startupConf original configuration, used to talk to the primary
     * @param conf modified configuration used by this (possibly standby) instance
     * @param startInfo startup parameters (instance id, standby flag, service name)
     * @param runInfo shared run-state flags
     * @param sessionId session identifier for this failover session
     * @param nameNodeAddr address of the primary; required when starting as standby
     * @param primaryNamenode RPC proxy to the primary; required when starting as standby
     * @throws IOException if the standby lacks a primary connection, or the
     *         configured safe mode implementation is not a StandbySafeMode
     */
    AvatarNode(Configuration startupConf, Configuration conf, StartupInfo startInfo, RunInfo runInfo,
            long sessionId, InetSocketAddress nameNodeAddr, NamenodeProtocol primaryNamenode) throws IOException {
        super(conf);

        // wrap namenode metrics
        this.metrics = new AvatarNodeMetrics(super.getNameNodeMetrics());

        // check if we talk to primary
        if (startInfo.isStandby && (nameNodeAddr == null || primaryNamenode == null)) {
            throw new IOException("RPC to primary namenode not initialized");
        }

        this.sessionId = sessionId;
        this.runInfo = runInfo;
        this.instance = startInfo.instance;
        // test framework is on whenever a non-zero sampling percentage is configured
        this.enableTestFramework = (conf.getFloat("dfs.avatarnode.failover.sample.percent", 0.0f) != 0.0f);
        this.enableTestFrameworkFsck = (conf.getBoolean("dfs.avatarnode.failover.fsck", false));

        // if we are starting as the standby then
        // record the fstime of the checkpoint that we are about to sync from
        if (startInfo.isStandby) {
            // Set the checkpoint time to the fstime of the image and edits
            // that were copied
            setStartCheckpointTxId(namesystem.getFSImage().storage.getMostRecentCheckpointTxId());
        }

        initialize(conf);
        currentAvatar = startInfo.isStandby ? Avatar.STANDBY : Avatar.ACTIVE;
        this.startupConf = startupConf;
        this.confg = conf;
        this.nameserviceId = startInfo.serviceName;

        if (currentAvatar == Avatar.STANDBY) {
            // Verify we have the correct safemode.
            SafeModeInfo safeMode = super.namesystem.getSafeModeInstance();
            if (safeMode == null || !(safeMode instanceof StandbySafeMode)) {
                throw new IOException("Invalid safe mode for Standby Avatar : " + safeMode
                        + " Standby Avatar should be using " + StandbySafeMode.class + " as its dfs.safemode.impl");
            }
            standbySafeMode = (StandbySafeMode) safeMode;

            // Standby has a different property for the max buffered transactions
            // to replay the log faster
            int maxStandbyBufferedTransactions = confg.getInt("dfs.max.standby.buffered.transactions",
                    HdfsConstants.DEFAULT_MAX_BUFFERED_TRANSACTIONS);
            FSEditLog.setMaxBufferedTransactions(maxStandbyBufferedTransactions);

            // Create a standby object which does the actual work of
            // processing transactions from the primary and checkpointing
            standby = new Standby(this, startupConf, confg, nameNodeAddr, primaryNamenode);
            standbyThread = new Thread(standby);
            standbyThread.setName("Standby");
            standbyThread.start();
            cleaner = new Cleaner();
            cleanerThread = new Thread(cleaner);
            cleanerThread.start();
        }
        isInitialized = true;
    }

    /** Records the fsck output captured at failover time (test framework). */
    protected void setFailoverFsck(String fsck) {
        this.failoverFsck = fsck;
    }

    /**
     * Wait for the StandbyNode to exit. If it does, then stop the underlying namenode.
     * If a failover is in flight, blocks until the failover state machine
     * settles; if this node is still the standby afterwards, shuts everything
     * down so the process can be restarted from scratch.
     */
    public void waitForRestart() {
        if (standbyThread != null) {
            try {
                // if this is the standby avatarnode, then wait for the Standby to exit
                standbyThread.join();
            } catch (InterruptedException ie) {
                // deliberately ignored: proceed as if the standby thread exited
            }
            standbyThread = null;
            LOG.info("waitForRestart: Standby thread exited.");

            InjectionHandler.processEvent(InjectionEvent.AVATARNODE_WAIT_FOR_RESTART);
            // a failover may be mid-flight; poll until it reaches a terminal state
            while (failoverState == FailoverState.START_FAILOVER || failoverState == FailoverState.AWAIT_FAILOVER) {
                LOG.info("Current state : " + failoverState + ". Waiting for failover ....");
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                    throw new RuntimeException("waitForRestart() interrupted");
                }
            }

            // if we are still in standbymode, that means we need to restart from
            // scratch.
            if (getAvatar() == Avatar.STANDBY) {
                runInfo.isRunning = false;
                LOG.info("waitForRestart Stopping encapsulated namenode.");
                super.stop(); // terminate encapsulated namenode
                super.join(); // wait for encapsulated namenode to exit
                shutdownStandby();
                LOG.info("waitForRestart exiting");
                return;
            }
        }
        super.join(); // wait for encapsulated namenode
    }

    /**
     * Registers this node's AvatarNodeStatusMBean with JMX.
     * A registration failure is logged and does not abort startup.
     */
    public void registerMBean() {
        try {
            StandardMBean bean = new StandardMBean(this, AvatarNodeStatusMBean.class);
            MBeanUtil.registerMBean("AvatarNode", "AvatarNodeState", bean);
        } catch (NotCompliantMBeanException mex) {
            LOG.error("Error registering mbean with JMX", mex);
        }
    }

    /** @return the AvatarNode metrics wrapper created in the constructor. */
    public AvatarNodeMetrics getAvatarNodeMetrics() {
        return this.metrics;
    }

    /** @return the string form of this node's instance id (zero/one). */
    @Override
    public String getInstance() {
        return this.instance.toString();
    }

    /** @return the string form of the current avatar (MBean attribute). */
    @Override
    public String getState() {
        return this.currentAvatar.toString();
    }

    /** @return the standby's replication lag in bytes, or 0 when not a standby. */
    @Override
    public long getLagBytes() {
        if (standby == null) {
            return 0;
        }
        return standby.getLagBytes();
    }

    /** @return the original (startup) configuration of this AvatarNode. */
    public Configuration getStartupConf() {
        return this.startupConf;
    }

    /**
     * Initialize AvatarNode: start the AvatarProtocol RPC server, register the
     * JMX status bean, and record the actual listener address.
     * @param conf the configuration
     * @throws IOException if the RPC server cannot be created
     */
    private void initialize(Configuration conf) throws IOException {
        InetSocketAddress socAddr = AvatarNode.getAddress(conf);
        int handlerCount = conf.getInt("hdfs.avatarnode.handler.count", 3);

        // create rpc server
        // no point to deserialize job file in Avatar node.
        this.server = RPC.getServer(this, socAddr.getAddress().getHostAddress(), socAddr.getPort(), handlerCount,
                false, conf, false);

        // The rpc-server port can be ephemeral... ensure we have the 
        // correct info
        this.serverAddress = this.server.getListenerAddress();
        LOG.info("AvatarNode up at: " + this.serverAddress);
        this.registerMBean();
        this.server.start();
    }

    /**
     * If the specified protocol is AvatarProtocol, return the AvatarProtocol
     * version id; otherwise delegate to the underlying namenode.
     *
     * @param protocol fully qualified protocol interface name
     * @param clientVersion version the client speaks
     * @return the matching protocol version id
     */
    public long getProtocolVersion(String protocol, long clientVersion) throws IOException {
        if (protocol.equals(AvatarProtocol.class.getName())) {
            return AvatarProtocol.versionID;
        }
        return super.getProtocolVersion(protocol, clientVersion);
    }

    //
    // methods to support Avatar Protocol
    //

    /**
     * @inheritDoc
     * Synchronized read of this node's current avatar.
     */
    public synchronized Avatar getAvatar() {
        return this.currentAvatar;
    }

    /**
     * @inheritDoc
     * Unsynchronized read of the current avatar (currentAvatar is volatile).
     */
    public Avatar reportAvatar() {
        return this.currentAvatar;
    }

    /**
     * @inheritDoc
     *
     * Pre-failover sanity check invoked over AvatarProtocol. Verifies, in
     * order: superuser privilege, upgrade finalization (active) or
     * checkpointing health (standby), initialization flag, and availability
     * of the edit streams.
     * @return true if every check passes
     * @throws IOException if any check fails, which blocks the failover
     */
    public boolean isInitialized() throws IOException {

        // check permissions, if this fails, the failover will not
        // progress, and this will prevent shutting down the primary avatar
        super.namesystem.checkSuperuserPrivilege();

        if (currentAvatar == Avatar.ACTIVE) {
            // check for upgrades
            if (!getFSImage().isUpgradeFinalized()) {
                throw new IOException("Failover: Upgrade must be finalized before failover");
            }
        } else /* STANDBY */ {
            // check if there are problems with checkpointing
            if (standby.getNumCheckpointFailures() > 0) {
                throw new IOException("Failover: Standby has problems with checkpointing");
            }
        }

        // for both check if the node is initialized
        if (!isInitialized) {
            throw new IOException("Failover: Standby is not initialized");
        }
        // for both check active edit streams
        verifyEditStreams();

        return true;
    }

    /**
     * @inheritDoc
     * The standby always checks heartbeats; the active avatar delegates the
     * decision to the underlying NameNode.
     */
    protected boolean shouldCheckHeartbeat() {
        if (currentAvatar != Avatar.ACTIVE) {
            return true;
        }
        return super.shouldCheckHeartbeat();
    }

    /**
     * Background thread performing the actual shutdown sequence: quiesce the
     * standby (if applicable), stop RPC, optionally capture fsck output for
     * the test framework, verify the shared journal, stop the node, and
     * finally record last-txid data (active) or stop auxiliary threads
     * (standby).
     */
    private static class ShutdownAvatarThread extends Thread {
        private final AvatarNode node;

        public ShutdownAvatarThread(AvatarNode node) {
            this.node = node;
        }

        @Override
        public void run() {
            try {
                node.runInfo.shutdown = true;
                LOG.info("Failover: Shutdown thread for " + node.currentAvatar + " starting...");
                if (node.currentAvatar == Avatar.STANDBY) {
                    // make sure that all transactions are consumed
                    try {
                        // do not recover the unclosed segment here.
                        node.standby.quiesce(FSEditLogLoader.TXID_IGNORE, false);
                    } catch (Throwable e) {
                        LOG.warn("Failover: standby error ", e);
                    }
                }
                // Need to stop RPC threads before capturing any final data about the
                // primary avatar.
                node.stopRPC(false);

                String fsck = "";
                try {
                    if (node.enableTestFramework && node.enableTestFrameworkFsck) {
                        LOG.info("Failover: Test framework - running fsck");
                        fsck = node.runFailoverFsck();
                        LOG.info("Failover: Test framework - fsck done");
                    }
                } catch (IOException e) {
                    // best-effort: fsck output only feeds the test framework,
                    // but do not swallow the failure silently
                    LOG.warn("Failover: Test framework - fsck failed", e);
                }

                // check if the shared journal is still available
                node.verifyEditStreams();

                // stop the node (namesystem, fsimage, editlog, etc.)
                node.stop();
                node.join(); // wait for encapsulated namenode to exit

                long totalBlocks = node.namesystem.getBlocksTotal();
                if (InjectionHandler.falseCondition(InjectionEvent.AVATARNODE_SHUTDOWN, totalBlocks)) {
                    // simulate crash
                    return;
                }

                if (node.currentAvatar == Avatar.STANDBY) {
                    node.shutdownStandby();
                } else if (node.currentAvatar == Avatar.ACTIVE) {
                    // If we are the primary we need to sync our last transaction id to
                    // zookeeper.
                    node.writeFailoverTestData(fsck);
                    AvatarNodeZkUtil.writeLastTxidToZookeeper(node.getLastWrittenTxId(), totalBlocks,
                            node.namesystem.getFilesAndDirectoriesTotal(), node.sessionId, node.startupConf,
                            node.confg);
                }
                InjectionHandler.processEvent(InjectionEvent.AVATARNODE_SHUTDOWN_COMPLETE);
            } catch (Exception e) {
                LOG.error("Failover: shutdownAvatar() failed", e);
            } finally {
                LOG.info("Failover: Shutdown thread for " + node.currentAvatar + " DONE.");
            }
        }
    }

    /**
     * Verifies that edit streams are healthy: passes if the shared journal is
     * available, or if the number of active journals equals the number of
     * configured journals; throws IOException otherwise.
     * (Injection conditions allow tests to force the failure path.)
     * @throws IOException when neither condition holds
     */
    private void verifyEditStreams() throws IOException {
        // we check if the shared stream is still available
        if (getFSImage().getEditLog().isSharedJournalAvailable()
                && InjectionHandler.trueCondition(InjectionEvent.AVATARNODE_CHECKEDITSTREAMS)) {
            return;
        }

        // for sanity check if the number of available journals
        // is equal to the number of configured ones
        int expectedEditStreams = NNStorageConfiguration.getNamespaceEditsDirs(confg).size();
        int actualEditStreams = this.namesystem.getFSImage().getEditLog().getNumberOfAvailableJournals();
        if (expectedEditStreams == actualEditStreams
                && InjectionHandler.trueCondition(InjectionEvent.AVATARNODE_CHECKEDITSTREAMS)) {
            return;
        }

        String msg = "Failover: Cannot proceed - shared journal is not available. "
                + "Number of required edit streams: " + expectedEditStreams + " current number: "
                + actualEditStreams;
        LOG.fatal(msg);
        throw new IOException(msg);
    }

    /**
     * Shuts down the avatar node. The actual work runs on a separate
     * ShutdownAvatarThread; this method optionally waits for that thread.
     *
     * @param synchronous - should the function wait for the shutdown to complete
     * @throws IOException if the caller lacks superuser privilege, the edit
     *         streams are unhealthy, or the synchronous wait is interrupted
     */
    public synchronized void shutdown(boolean synchronous) throws IOException {
        // log message previously always said "Asynchronous"; reflect the flag
        LOG.info("Failover: " + (synchronous ? "Synchronous" : "Asynchronous")
                + " shutdown for: " + currentAvatar);

        // check permissions before any other actions
        super.namesystem.checkSuperuserPrivilege();

        if (runInfo.shutdown) {
            LOG.info("Failover: Node already shut down");
            return;
        }

        // check edit streams
        // if this fails, we still have a chance to fix it
        // and shutdown again
        verifyEditStreams();

        runInfo.shutdown = true;
        Thread shutdownThread = new ShutdownAvatarThread(this);
        shutdownThread.setName("ShutDown thread for : " + serverAddress);
        shutdownThread.setDaemon(false);
        shutdownThread.start();

        if (synchronous) {
            LOG.info("Failover: Waiting for shutdown to complete");
            try {
                shutdownThread.join();
            } catch (InterruptedException ie) {
                // restore the interrupt status before surfacing the failure
                Thread.currentThread().interrupt();
                throw new IOException(ie);
            }
        }
    }

    /**
     * AvatarProtocol shutdown entry point: delegates to shutdown(false),
     * i.e. starts the shutdown without waiting for it to complete.
     */
    @Override
    public void shutdownAvatar() throws IOException {
        shutdown(false);
    }

    /**
     * Used only for testing.
     * @return the Standby object driving log ingestion and checkpointing
     * @throws IOException if this node is not currently the standby avatar
     */
    public Standby getStandby() throws IOException {
        if (currentAvatar == Avatar.STANDBY) {
            return this.standby;
        }
        throw new IOException("This is not the standby avatar");
    }

    /**
     * @return the session id of this failover session
     * @throws IOException if this node is not currently the primary avatar
     */
    public long getSessionId() throws IOException {
        if (currentAvatar == Avatar.ACTIVE) {
            return this.sessionId;
        }
        throw new IOException("This is not the primary avatar");
    }

    /**
     * Used only for testing.
     * Quiesces the standby up to the given transaction id.
     * @throws IOException if this node is not currently the standby avatar
     */
    public void quiesceStandby(long txId) throws IOException {
        if (currentAvatar != Avatar.STANDBY) {
            throw new IOException("This is not the standby avatar");
        }
        this.standby.quiesce(txId);
    }

    /**
     * Stops the standby machinery: the Standby object, the AvatarProtocol
     * RPC server, and the Cleaner thread. Only safe to call when running as
     * (or after having been) the standby, since standby must be non-null.
     */
    public void shutdownStandby() {
        standby.shutdown();

        if (server != null) { // shutdown the AvatarNode
            LOG.info("Stopping avatarnode rpcserver.");
            server.stop();
            try {
                server.join();
            } catch (InterruptedException ie) {
                // deliberately ignored: continue the shutdown sequence
            }
        }
        if (cleaner != null) {
            // Shut down the cleaner thread as it will keep
            // the process from shutting down
            cleaner.stop();
            cleanerThread.interrupt();
            try {
                cleanerThread.join();
            } catch (InterruptedException iex) {
                // restore the interrupt status for callers up the stack
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Stops all RPC threads and ensures that all RPC handlers have exited.
     * Stops all communication to the namenode.
     * Order matters: the avatardatanode server is signalled first, then the
     * namenode RPC servers are stopped, and only then do we wait for the
     * avatardatanode server to fully terminate.
     * @param interruptClientHandlers whether to interrupt in-flight client handlers
     * @throws IOException if the stop sequence is interrupted
     */
    protected void stopRPC(boolean interruptClientHandlers) throws IOException {
        try {
            // stop avatardatanode server
            stopRPCInternal(server, "avatardatanode", interruptClientHandlers);

            // stop namenode rpc (client, datanode)
            super.stopRPC(interruptClientHandlers);

            // wait for avatardatanode rpc
            stopWaitRPCInternal(server, "avatardatanode");
        } catch (InterruptedException ex) {
            throw new IOException("stopRPC() interrupted", ex);
        }
    }

    /**
     * Compares local namenode state (last written txid, total blocks, total
     * inodes) against the values the old primary recorded in zookeeper, and
     * throws on any mismatch.
     * @param zkTxId the record read from zookeeper
     * @throws IOException (StandbyStateException) describing the first mismatch found
     */
    private void verifyTransactionIds(ZookeeperTxId zkTxId) throws IOException {
        // TODO for unit test it can happen than rollEditLog happens after 
        // obtaining the txid so we might have a difference of 2 !!!
        long zkLastTxId = zkTxId.getTransactionId();
        long zkTotalBlocks = zkTxId.getTotalBlocks();
        long zkTotalInodes = zkTxId.getTotalInodes();

        // local values
        long lastTxId = super.getLastWrittenTxId();
        long totalBlocks = super.namesystem.getBlocksTotal();
        long totalINodes = super.namesystem.getFilesAndDirectoriesTotal();

        // Verify transacation ids.
        if (lastTxId < 0 || zkLastTxId < 0) {
            throw new StandbyStateException("Invalid transacation ids, txid in NameNode : " + lastTxId
                    + " txid in Zookeeper : " + zkLastTxId);
        } else if (lastTxId != zkLastTxId) {
            throw new StandbyStateException("The transacation id in the namenode : " + lastTxId
                    + " does not match the transaction id in zookeeper : " + zkLastTxId
                    + formatErrorMessage(lastTxId, zkLastTxId, "transactions"));
        } else if (zkTotalBlocks != totalBlocks) {
            throw new StandbyStateException(
                    "Total blocks in ZK : " + zkTotalBlocks + " don't match up with total blocks on Standby : "
                            + totalBlocks + formatErrorMessage(totalBlocks, zkTotalBlocks, "blocks"));
        } else if (zkTotalInodes != totalINodes) {
            throw new StandbyStateException(
                    "Total inodes in ZK : " + zkTotalInodes + " don't match up with total inodes on Standby : "
                            + totalINodes + formatErrorMessage(totalINodes, zkTotalInodes, "inodes"));
        }
    }

    /**
     * Formats a " Standby has N more/fewer <what>." suffix for mismatch
     * messages, based on the local-minus-remote difference.
     */
    private String formatErrorMessage(long localNumber, long remoteNumber, String what) {
        final long delta = localNumber - remoteNumber;
        final String direction = (delta > 0) ? "more " : "fewer ";
        return " Standby has " + Math.abs(delta) + " " + direction + what + ".";
    }

    /** Builds the failover snapshot file path under the given directory. */
    private static File buildSnapshotFilePath(String pathDir) {
        String child = Path.SEPARATOR + FAILOVER_SNAPSHOT_FILE;
        return new File(pathDir, child);
    }

    /**
     * Determines where the failover snapshot file lives (test framework).
     *
     * @param conf configuration to consult
     * @param remote true to resolve against the remote shared edits URI,
     *        false for the local one
     * @return the snapshot file, or null when no file-based location exists
     * @throws IOException if the shared edits URI cannot be determined
     */
    private File getSnapshotFile(Configuration conf, boolean remote) throws IOException {
        // consistently honor the conf argument (previously the confg field was
        // used for the URI lookups; all current callers pass confg, so this is
        // behavior-compatible)
        URI sharedEditsDirectory = remote ? getRemoteSharedEditsURI(conf) : getLocalSharedEditsURI(conf);
        if (!isFile(sharedEditsDirectory)) {
            // shared edits are not on a local filesystem; fall back to an
            // explicitly configured test data directory, if any
            String failoverDataDir = conf.get("dfs.avatarnode.failover.test.data.dir");
            if (failoverDataDir != null) {
                return buildSnapshotFilePath(failoverDataDir);
            }
            return null;
        }
        return buildSnapshotFilePath(sharedEditsDirectory.getPath());
    }

    /**
     * Writes the failover test snapshot (open files info, a random sample of
     * file statuses, and optionally fsck output) to the remote shared edits
     * location, for verification by the new primary after failover.
     * No-op when the test framework is disabled or no snapshot path exists.
     * @param fsck fsck output to embed when fsck verification is enabled
     * @throws IOException on write failure
     */
    private void writeFailoverTestData(String fsck) throws IOException {
        if (!enableTestFramework) {
            LOG.info("Failover: Test framework - disabled");
            return;
        }
        File snapshotFile = getSnapshotFile(confg, true);
        if (snapshotFile == null)
            return;

        float samplePercent = confg.getFloat("dfs.avatarnode.failover.sample.percent", 0.05f);
        LOG.info("Failover: Test framework - using " + (100.0 * samplePercent) + " % sample size");
        List<FileStatusExtended> stat = super.getRandomFilesSample(samplePercent);
        AvatarFailoverSnapshot snapshot = new AvatarFailoverSnapshot(super.namesystem.getOpenFiles(), stat);
        DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(snapshotFile)));
        try {
            snapshot.write(out);
            // boolean flag tells the reader whether fsck output follows
            out.writeBoolean(enableTestFrameworkFsck);
            if (enableTestFrameworkFsck) {
                Text.writeString(out, fsck);
            }
        } finally {
            out.close();
        }
        LOG.info("Failover: Test framework - saved snapshot file : " + snapshotFile);
    }

    /**
     * Verifies that the given file status from the snapshot matches the
     * current state of the namesystem for the same path.
     * NOTE(review): if getFileInfoExtended can return null for a missing
     * path, this would throw NPE rather than IOException — confirm upstream.
     * @throws IOException when the stored and current status differ
     */
    private void verifySnapshotSampledFile(FileStatusExtended file) throws IOException {
        FileStatusExtended stat = super.namesystem.getFileInfoExtended(file.getPath().toString());
        if (!stat.equals(file)) {
            throw new IOException("Information for file : " + file.getPath()
                    + " does not match with information on snapshot file, expected : " + file + ", actual : "
                    + stat);
        }
    }

    /**
     * Checks that the generation stamp recorded in the snapshot matches this
     * node's, then verifies every open file listed in the snapshot.
     * @throws IOException on generation-stamp or per-file mismatch
     */
    private void verifyOpenFiles(OpenFilesInfo openFilesInfo) throws IOException {
        long localGenStamp = super.namesystem.getGenerationStamp();
        if (openFilesInfo.getGenStamp() != localGenStamp) {
            throw new IOException("GS on snapshot file doesn't match with GS on node : "
                    + openFilesInfo.getGenStamp() + ", " + localGenStamp);
        }
        for (FileStatusExtended openFile : openFilesInfo.getOpenFiles()) {
            verifySnapshotSampledFile(openFile);
        }
    }

    /**
     * Reads the failover snapshot written by the old primary from the local
     * shared edits location and verifies open and sampled closed files
     * against the current namesystem. Stores any embedded fsck output in
     * oldPrimaryFsck for later comparison. No-op when the test framework is
     * disabled or no snapshot path exists.
     * @throws IOException on read failure or verification mismatch
     */
    private void verifyFailoverTestData() throws IOException {
        if (!enableTestFramework) {
            LOG.info("Failover: Test framework - disabled");
            return;
        }
        String fsck = "";
        LOG.info("Failover: Test framework - verification - starting...");
        AvatarFailoverSnapshot snapshot = new AvatarFailoverSnapshot();
        File snapshotFile = getSnapshotFile(confg, false);
        if (snapshotFile == null)
            return;
        DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(snapshotFile)));
        try {
            snapshot.readFields(in);
            // boolean flag written by the old primary indicates fsck data follows
            if (in.readBoolean()) {
                LOG.info("Failover: Test framework - found fsck data");
                fsck = Text.readString(in);
            }
        } finally {
            in.close();
        }

        LOG.info("Failover: Test framework - verifying open files: found "
                + snapshot.getOpenFilesInfo().getOpenFiles().size() + " files in the test snapshot");
        verifyOpenFiles(snapshot.getOpenFilesInfo());

        LOG.info("Failover: Test framework - verifying closed files: found " + snapshot.getSampledFiles().size()
                + " files in the test snapshot");
        for (FileStatusExtended stat : snapshot.getSampledFiles()) {
            verifySnapshotSampledFile(stat);
        }

        LOG.info("Failover: Test framework - verification - succeeded");
        this.oldPrimaryFsck = fsck;
    }

    /**
     * Runs a full fsck rooted at "/" and returns its textual output.
     * @throws IOException if the fsck cannot be executed
     */
    protected String runFailoverFsck() throws IOException {
        StringWriter output = new StringWriter();

        Map<String, String[]> params = new HashMap<String, String[]>();
        params.put("path", new String[] { "/" });

        // run fsck, collecting its report into the writer
        new NamenodeFsck(confg, this, params, new PrintWriter(output)).fsck();
        return output.toString();
    }

    /**
     * Clears failover-outstanding datanodes and drops the standby safemode
     * reference.  Used for fast failover, once the over-replication
     * processing starts.
     */
    void clearOutstandingNodes() {
        if (standbySafeMode == null) {
            return;
        }
        standbySafeMode.removeOutStandingDatanodes(false);
        standbySafeMode = null;
    }

    /**
     * Second phase of failover: turns this (already quiesced) standby into
     * the active avatar.  Stops the standby cleaner thread, re-opens the edit
     * log if it was closed by ingestion, exits standby safemode, records the
     * new active in ZooKeeper and finally flips {@code currentAvatar} to
     * ACTIVE.  Must be preceded by a successful
     * {@link #quiesceForFailover(boolean)}.
     *
     * @throws IOException if this node is not a standby or a failover step fails
     */
    @Override
    public synchronized void performFailover() throws IOException {
        // check permissions before any other actions
        super.namesystem.checkSuperuserPrivilege();

        failoverState = FailoverState.PERFORM_FAILOVER;
        enforceStandby("Cannot perform failover on active");
        // stop the standby queue-cleaner thread and wait for it to exit
        cleaner.stop();
        cleanerThread.interrupt();
        try {
            cleanerThread.join();
        } catch (InterruptedException iex) {
            // preserve the interrupt for the caller
            Thread.currentThread().interrupt();
        }

        // change the value to the one for the primary
        int maxStandbyBufferedTransactions = confg.getInt("dfs.max.buffered.transactions",
                HdfsConstants.DEFAULT_MAX_BUFFERED_TRANSACTIONS);
        FSEditLog.setMaxBufferedTransactions(maxStandbyBufferedTransactions);

        // Clear up deletion and replication queues.
        clearInvalidateAndReplicationQueues();

        // if the log was closed by ingestion, re-open it
        if (!getFSImage().getEditLog().isOpen())
            getFSImage().getEditLog().open();

        LOG.info("Failover: Triggering safemode failover");
        standbySafeMode.triggerFailover();

        // publish the new active in ZooKeeper; the returned id identifies
        // this failover session
        sessionId = AvatarNodeZkUtil.writeToZooKeeperAfterFailover(startupConf, confg);
        LOG.info("Failover: Changed avatar from " + currentAvatar + " to " + Avatar.ACTIVE);
        // test framework only: compare the fsck captured on the old primary
        // against a fresh fsck of this node and log any mismatch
        if (enableTestFramework && enableTestFrameworkFsck) {
            if (!failoverFsck.equals(oldPrimaryFsck)) {
                LOG.warn("Failover: FSCK on old primary and new primary do not match");
                LOG.info("----- FSCK ----- OLD BEGIN");
                LOG.info("Failover: Old primary fsck: \n " + oldPrimaryFsck + "\n");
                LOG.info("----- FSCK ----- NEW BEGIN");
                LOG.info("Failover: New primary fsck: \n " + failoverFsck + "\n");
                LOG.info("----- FSCK ----- END");
            } else {
                LOG.info("Failover: Verified fsck.");
            }
        }

        currentAvatar = Avatar.ACTIVE;
        // from here on, use the regular (primary) safemode implementation
        confg.setClass("dfs.safemode.impl", NameNodeSafeModeInfo.class, SafeModeInfo.class);
    }

    /**
     * First phase of failover: quiesces this standby so it has consumed all
     * edits up to the last transaction recorded in ZooKeeper.  On success the
     * node is left in AWAIT_FAILOVER state; on any other outcome the state is
     * set to FAILED_FAILOVER.
     *
     * @param noverification if true, skip ZooKeeper transaction-id
     *          verification and quiesce without a target transaction id
     * @throws IOException if verification data cannot be obtained or does not match
     */
    @Override
    public synchronized void quiesceForFailover(boolean noverification) throws IOException {
        // check permissions before any other actions
        super.namesystem.checkSuperuserPrivilege();

        failoverState = FailoverState.START_FAILOVER;
        enforceStandby("Cannot quiesce primary");
        try {
            // Check to see if the primary is somehow checkpointing itself. If so,
            // then
            // refuse to switch to active mode. This check is not foolproof but is a
            // defensive mechanism to prevent administrator errors.
            // NOTE(review): startupConf is passed for BOTH configuration
            // arguments here, while performFailover() passes (startupConf,
            // confg) to the corresponding "after" call -- confirm this
            // asymmetry is intentional.
            ZookeeperTxId zkTxId = AvatarNodeZkUtil.checkZooKeeperBeforeFailover(startupConf, startupConf,
                    noverification);

            if (!noverification) {
                if (zkTxId == null) {
                    throw new IOException("Could not receive last transaction id from zookeeper");
                }
                // catch up to exactly the transaction id the old primary recorded
                standby.quiesce(zkTxId.getTransactionId());
            } else {
                // quiesce without comparing against a target transaction id
                standby.quiesce(FSEditLogLoader.TXID_IGNORE);
            }

            if (!noverification) {
                verifyTransactionIds(zkTxId);
                verifyFailoverTestData();
            }
            failoverState = FailoverState.AWAIT_FAILOVER;
        } catch (StandbyStateException se) {
            // still advance to AWAIT_FAILOVER despite the exception --
            // presumably the standby state error is non-fatal for failover;
            // the exception is propagated to the caller regardless
            failoverState = FailoverState.AWAIT_FAILOVER;
            throw se;
        } finally {
            if (failoverState != FailoverState.AWAIT_FAILOVER) {
                failoverState = FailoverState.FAILED_FAILOVER;
            }
        }
    }

    /**
     * Switches this node to the given avatar with verification enabled.
     *
     * @param avatar the target avatar
     * @throws IOException if the transition fails
     * @deprecated use {@link #quiesceForFailover(boolean)} followed by
     *             {@link #performFailover()} instead.
     */
    @Deprecated
    @Override
    public synchronized void setAvatar(Avatar avatar) throws IOException {
        setAvatar(avatar, false);
    }

    /**
     * Switches this node to the given avatar, optionally skipping the
     * ZooKeeper transaction-id verification.  A request for the current
     * avatar is a no-op.
     *
     * @param avatar the target avatar
     * @param force  if true, skip transaction-id verification while quiescing
     * @throws IOException if the transition fails
     * @deprecated use {@link #quiesceForFailover(boolean)} followed by
     *             {@link #performFailover()} instead.
     */
    @Deprecated
    @Override
    public synchronized void setAvatar(Avatar avatar, boolean force) throws IOException {
        try {
            if (avatar == currentAvatar) {
                LOG.info("Failover: Trying to change avatar to " + avatar + " but am already in that state.");
                return;
            }
            quiesceForFailover(force);
            performFailover();
        } catch (IOException e) {
            LOG.fatal("Exception when setting avatar", e);
            throw e;
        }
    }

    /*
     * While the AvatarNode is running in Standby mode it fills up the
     * invalidates queues for each datanode with blocks it
     * assumes have to be deleted. This information is not
     * entirely accurate and fills up memory as well as leads
     * to data loss, since those queues are flushed to the datanodes
     * on failover and valid blocks may be deleted.
     * 
     * To help prevent filling up the memory, we clear these queues
     * periodically, and we do a final cleanup just before switching
     * to primary.
     * 
     * Also, we make sure that the replication queues are cleaned 
     * periodically. They are never processed on the standby, and would
     * otherwise grow indefinitely.
     */
    // Background task that periodically clears the standby's invalidate and
    // replication queues (see the comment above this class).
    private class Cleaner implements Runnable {

        // volatile: written by stop() from the failover thread, read by run()
        volatile boolean running = true;

        @Override
        public void run() {
            LOG.info("Starting Standby Cleaner thread");
            while (running) {
                clearInvalidateAndReplicationQueues();
                try {
                    Thread.sleep(INVALIDATES_CLEANUP_INTERVAL);
                } catch (InterruptedException iex) {
                    // stop() followed by interrupt() lands here with running == false
                    if (running == false)
                        return;
                    // NOTE(review): re-asserting the interrupt while running is
                    // still true makes the next sleep() throw immediately, so a
                    // stray interrupt would turn this loop into a busy spin --
                    // confirm interrupts only come from performFailover() after stop().
                    Thread.currentThread().interrupt();
                }
            }
        }

        // Signals the loop to exit; the caller also interrupts the thread.
        public void stop() {
            running = false;
        }

    }

    /**
     * Drops the per-datanode invalidate queues and the replication queues.
     * Refuses to run unless the node is in safemode; any failure is counted
     * in metrics and logged rather than propagated.
     */
    private void clearInvalidateAndReplicationQueues() {
        try {
            LOG.info("Standby Cleaner: cleaning queues");
            if (!namesystem.isInSafeMode()) {
                throw new IOException("Avatar is not in safemode");
            }
            // snapshot the datanode list under the read lock
            final DatanodeInfo[] datanodes;
            super.namesystem.readLock();
            try {
                datanodes = super.namesystem.getDatanodes(DatanodeReportType.ALL);
            } finally {
                super.namesystem.readUnlock();
            }
            // drop each datanode's invalidate queue under the write lock
            super.namesystem.writeLock();
            try {
                for (int i = 0; i < datanodes.length; i++) {
                    super.namesystem.removeFromInvalidates(datanodes[i].getStorageID());
                }
            } finally {
                super.namesystem.writeUnlock();
            }
            super.namesystem.clearReplicationQueues();
        } catch (Exception e) {
            metrics.numCleanerThreadExceptions.inc();
            LOG.error("Standby Cleaner : exception when cleaning " + "replication queues", e);
        }
    }

    /**
     * Decides whether datanode requests should be told to back off: true only
     * for a standby whose ingest is missing or has fallen behind (or when a
     * test injection forces it).  The decision is also published to metrics.
     */
    private boolean ignoreDatanodes() {
        final boolean backOff;
        if (currentAvatar != Avatar.STANDBY) {
            backOff = false;
        } else {
            backOff = standby == null || standby.fellBehind()
                    || InjectionHandler.falseCondition(InjectionEvent.STANDBY_FELL_BEHIND);
        }
        metrics.ignoreDataNodes.set(backOff ? 1 : 0);
        return backOff;
    }

    /**
     * Datanode callback confirming it has cleared its primary; forwarded to
     * standby safemode bookkeeping when that is active.
     */
    @Override
    public void primaryCleared(DatanodeRegistration registration) {
        LOG.info("Received primaryCleared() from : " + registration);
        if (standbySafeMode == null) {
            return;
        }
        standbySafeMode.reportPrimaryCleared(registration);
    }

    /**
     * Registers a datanode with the base namenode and, when standby safemode
     * is active, records the registration there as well.
     */
    @Override
    public DatanodeRegistration register(DatanodeRegistration nodeReg) throws IOException {
        final DatanodeRegistration registration = super.register(nodeReg);
        if (standbySafeMode != null) {
            standbySafeMode.reportRegister(nodeReg);
        }
        return registration;
    }

    /**
     * Heartbeat handler that augments the base namenode's response with
     * avatar-specific commands: clear-primary (when standby safemode has seen
     * the heartbeat), back-off (when the standby fell behind), or
     * prepare-failover.
     */
    public DatanodeCommand[] sendHeartbeatNew(DatanodeRegistration registration, long capacity, long dfsUsed,
            long remaining, long namespaceUsed, int xmitsInProgress, int xceiverCount) throws IOException {
        DatanodeCommand[] commands = super.sendHeartbeat(registration, capacity, dfsUsed, remaining, namespaceUsed,
                xmitsInProgress, xceiverCount);

        if (standbySafeMode != null && standbySafeMode.reportHeartBeat(registration)) {
            LOG.info("Sending Clear Primary command to : " + registration);
            commands = addCommand(commands, AvatarDatanodeCommand.CLEARPRIMARY);
        } else if (ignoreDatanodes()) {
            commands = addCommand(commands, AvatarDatanodeCommand.BACKOFF);
        } else if (standbySafeMode != null && standbySafeMode.getPrepareFailover()) {
            commands = addCommand(commands, AvatarDatanodeCommand.PREPAREFAILOVER);
        }
        return commands;
    }

    /**
     * Returns a new array consisting of {@code cmds} with {@code toAdd}
     * appended; a null input array yields a singleton array.
     */
    private DatanodeCommand[] addCommand(DatanodeCommand[] cmds, DatanodeCommand toAdd) {
        final int oldLength = (cmds == null) ? 0 : cmds.length;
        DatanodeCommand[] extended = new DatanodeCommand[oldLength + 1];
        if (oldLength > 0) {
            System.arraycopy(cmds, 0, extended, 0, oldLength);
        }
        extended[oldLength] = toAdd;
        return extended;
    }

    /**
     * Determines whether or not the datanode should retry blocks if they are
     * not present in the blocks map.  Only the standby asks for retries,
     * since it may not have consumed the relevant edit-log records yet.
     * (Javadoc moved before {@code @Override} so the javadoc tool picks it up.)
     */
    @Override
    public boolean shouldRetryAbsentBlocks() {
        return (currentAvatar == Avatar.STANDBY);
    }

    /**
     * Determines whether or not the given block should be retried by the
     * datanode if it is not present in the blocksMap.
     * (Javadoc moved before {@code @Override} so the javadoc tool picks it up.)
     *
     * @param block       the block reported by the datanode
     * @param storedBlock the block currently in the blocksMap, or null
     */
    @Override
    public boolean shouldRetryAbsentBlock(Block block, Block storedBlock) {
        // If this block does not belong to anyfile and its GS
        // is no less than the avatar node's GS,
        // AvatarNode may not consume the file/block creation edit log yet,
        // so adding it to the retry list.
        return (currentAvatar == Avatar.STANDBY
                && (!namesystem.getPersistBlocks() || block.getGenerationStamp() >= namesystem.getGenerationStamp())
                && (storedBlock == null || block.getGenerationStamp() > storedBlock.getGenerationStamp()));
    }

    /**
     * Full block-report handler.  The standby folds all blocks that must be
     * retried into a single DNA_RETRY command; only the primary may process
     * the report normally (and thus send DNA_FINALIZE).
     */
    public DatanodeCommand blockReportNew(DatanodeRegistration nodeReg, BlockReport rep) throws IOException {
        if (runInfo.shutdown || !runInfo.isRunning) {
            return null;
        }
        if (ignoreDatanodes()) {
            LOG.info("Standby fell behind. Telling " + nodeReg.toString() + " to back off");
            // Do not process block reports yet as the ingest thread is catching up
            return AvatarDatanodeCommand.BACKOFF;
        }

        if (currentAvatar != Avatar.STANDBY) {
            // only the primary can send DNA_FINALIZE
            return super.blockReport(nodeReg, rep);
        }
        // standby should send only DNA_RETRY
        Collection<Block> failed = super.blockReportWithRetries(nodeReg, rep);
        return new BlockCommand(DatanodeProtocols.DNA_RETRY, failed.toArray(new Block[failed.size()]));
    }

    /**
     * Bumps the incremental-block-report counters; a no-op when metrics are
     * not initialized.
     */
    private void updateIBRMetrics(int reported, int retried) {
        if (metrics == null) {
            return;
        }
        metrics.numReportedBlocks.inc(reported);
        metrics.numRetryBlocks.inc(retried);
    }

    /**
     * @inheritDoc
     *
     * Incremental block report handler (array form).  On the standby, entries
     * that cannot be processed yet are nulled out of the incoming array (so
     * the base namenode skips them) and returned for the datanode to retry.
     */
    public Block[] blockReceivedAndDeletedNew(DatanodeRegistration nodeReg, Block blocksReceivedAndDeleted[])
            throws IOException {
        if (runInfo.shutdown || !runInfo.isRunning) {
            // Do not attempt to process blocks when
            // the namenode is not running
            return new ReceivedBlockInfo[0];
        }
        if (ignoreDatanodes()) {
            LOG.info("Standby fell behind. Telling " + nodeReg.toString() + " to retry incremental block report of "
                    + blocksReceivedAndDeleted.length + " blocks later.")
;
            metrics.numIgnoredDatanodes.inc();
            // the entire report must be retried later
            return blocksReceivedAndDeleted;
        }
        List<Block> failed = new ArrayList<Block>();
        HashSet<Long> failedIds;
        if (currentAvatar == Avatar.STANDBY) {
            failedIds = new HashSet<Long>();
            namesystem.writeLock();
            try {
                for (int index = 0; index < blocksReceivedAndDeleted.length; index++) {
                    Block blockRD = blocksReceivedAndDeleted[index];
                    if (failedIds.contains(blockRD.getBlockId())) {
                        // check if there was no other blocking failed request
                        // (once a request for a block id is deferred, all later
                        // requests for the same id are deferred to keep ordering)
                        blocksReceivedAndDeleted[index] = null;
                        failed.add(blockRD);
                        continue;
                    }
                    BlockInfo storedBlock = namesystem.blocksMap.getStoredBlock(blockRD);
                    if (!DFSUtil.isDeleted(blockRD) && shouldRetryAbsentBlock(blockRD, storedBlock)) {
                        // If this block does not belong to anyfile and its GS
                        // is no less than the avatar node's GS,
                        // AvatarNode may not consume the file/block creation edit log yet,
                        // so adding it to the failed list.
                        // - do not process any requestes for blocks with the same block id
                        // (also add them to the failed list.
                        // - do not block other requests
                        blocksReceivedAndDeleted[index] = null;
                        failed.add(blockRD);
                        failedIds.add(blockRD.getBlockId());
                    }
                }
            } finally {
                namesystem.writeUnlock();
                // logging happens outside the write lock
                if (!failed.isEmpty()) {
                    LOG.info("*BLOCK* NameNode.blockReceivedAndDeleted: " + "from " + nodeReg.getName()
                            + " has to retry " + failed.size() + " blocks.");
                }
                for (Block blockRD : failed) {
                    LOG.info("blockReceivedDeleted " + (DFSUtil.isDeleted(blockRD) ? "DELETED" : "RECEIVED")
                            + " request received for " + blockRD + " on " + nodeReg.getName() + " size "
                            + blockRD.getNumBytes() + " But it does not belong to any file." + " Retry later.");
                }
            }
        }
        // process the remaining (non-null) entries through the base namenode
        super.blockReceivedAndDeleted(nodeReg, blocksReceivedAndDeleted);
        updateIBRMetrics(blocksReceivedAndDeleted.length, failed.size());
        return failed.toArray(new Block[failed.size()]);
    }

    /**
     * @inheritDoc
     *
     * Incremental block report handler (packed form).  On the standby, the
     * returned bit set marks the indices the datanode must retry; the
     * corresponding entries are flagged with BlockFlags.IGNORE so the base
     * namenode skips them.  Returns null when not running as standby.
     */
    public long[] blockReceivedAndDeletedNew(DatanodeRegistration nodeReg,
            IncrementalBlockReport receivedAndDeletedBlocks) throws IOException {
        InjectionHandler.processEvent(InjectionEvent.AVATARNODE_BLOCKRECEIVED_AND_DELETED_NEW);
        long[] failedMap = null;
        if (runInfo.shutdown || !runInfo.isRunning) {
            // Do not attempt to process blocks when
            // the namenode is not running
            if (currentAvatar == Avatar.STANDBY) {
                return new long[0];
            } else {
                return null;
            }
        }
        HashSet<Long> failedIds;
        if (currentAvatar == Avatar.STANDBY) {
            int noAck = receivedAndDeletedBlocks.getLength();

            // retry all block if the standby is behind consuming edits
            if (ignoreDatanodes()) {
                LOG.info("Standby fell behind. Telling " + nodeReg.toString()
                        + " to retry incremental block report of " + noAck + " blocks later.");
                failedMap = LightWeightBitSet.getBitSet(noAck);
                for (int i = 0; i < noAck; i++)
                    LightWeightBitSet.set(failedMap, i);
                metrics.numIgnoredDatanodes.inc();
                return failedMap;
            }

            // blockRD is reused as a cursor over the packed report
            Block blockRD = new Block();
            failedIds = new HashSet<Long>();
            failedMap = LightWeightBitSet.getBitSet(noAck);
            namesystem.writeLock();
            try {
                receivedAndDeletedBlocks.resetIterator();
                for (int currentBlock = 0; currentBlock < noAck; currentBlock++) {
                    receivedAndDeletedBlocks.getNext(blockRD);
                    if (failedIds.contains(blockRD.getBlockId())) {
                        // check if there was no other blocking failed request
                        // (later requests for an already-failed id are deferred
                        // too, to preserve per-block ordering)
                        blockRD.setNumBytes(BlockFlags.IGNORE);
                        receivedAndDeletedBlocks.setBlock(blockRD, currentBlock);
                        LightWeightBitSet.set(failedMap, currentBlock);
                        continue;
                    }
                    BlockInfo storedBlock = namesystem.blocksMap.getStoredBlock(blockRD);
                    if ((!DFSUtil.isDeleted(blockRD) && shouldRetryAbsentBlock(blockRD, storedBlock))) {
                        // If this block does not belong to anyfile and its GS
                        // is no less than the avatar node's GS,
                        // AvatarNode may not consume the file/block creation edit log yet,
                        // so adding it to the failed list. Also, if the stored block GS is
                        // less than the reported GS then we need to retry the block.
                        // - do not process any requestes for blocks with the same block id
                        // (also add them to the failed list.
                        // - do not block other requests
                        blockRD.setNumBytes(BlockFlags.IGNORE);
                        receivedAndDeletedBlocks.setBlock(blockRD, currentBlock);
                        LightWeightBitSet.set(failedMap, currentBlock);
                        failedIds.add(blockRD.getBlockId());
                    }
                }
            } finally {
                namesystem.writeUnlock();
                // logging happens outside the write lock
                if (failedMap != null && LightWeightBitSet.cardinality(failedMap) != 0) {
                    LOG.info("*BLOCK* NameNode.blockReceivedAndDeleted: " + "from " + nodeReg.getName()
                            + " has to retry " + LightWeightBitSet.cardinality(failedMap) + " blocks.");
                }
                receivedAndDeletedBlocks.resetIterator();
                for (int currentBlock = 0; currentBlock < noAck; currentBlock++) {
                    receivedAndDeletedBlocks.getNext(blockRD);
                    if (!LightWeightBitSet.get(failedMap, currentBlock))
                        continue;
                    LOG.info("blockReceivedDeleted " + (DFSUtil.isDeleted(blockRD) ? "DELETED" : "RECEIVED")
                            + " request received for " + blockRD + " on " + nodeReg.getName() + " size "
                            + blockRD.getNumBytes() + " But it does not belong to any file." + " Retry later.");
                }
            }
        }
        // process the remaining (non-IGNORE) entries through the base namenode
        super.blockReceivedAndDeleted(nodeReg, receivedAndDeletedBlocks);
        updateIBRMetrics(receivedAndDeletedBlocks.getLength(),
                failedMap != null ? LightWeightBitSet.cardinality(failedMap) : 0);
        return failedMap;
    }

    /**
     * Rolls the edit log on behalf of the checkpointing standby.  Rejected on
     * a standby avatar and for callers other than the configured checkpointer.
     */
    public CheckpointSignature rollEditLog() throws IOException {
        enforceActive("Cannot roll edit log on standby");
        verifyCheckpointerAddress();
        CheckpointSignature signature = super.rollEditLog();
        return signature;
    }

    /**
     * Rolls the edit log for a manual dfsadmin request.  Unlike
     * {@link #rollEditLog()}, the caller is not verified against the
     * checkpointer address since this is an administrator call.
     */
    @Override
    public void rollEditLogAdmin() throws IOException {
        enforceActive("Cannot roll edit log on standby");
        super.rollEditLog();
    }

    /**
     * Rolls the fsimage on behalf of the checkpointing standby.  Rejected on
     * a standby avatar and for callers other than the configured checkpointer.
     */
    public void rollFsImage(CheckpointSignature newImageSignature) throws IOException {
        enforceActive("Cannot roll image on standby");
        verifyCheckpointerAddress();
        super.rollFsImage(newImageSignature);
    }

    /** Saves the namespace with default options (not forced, compressed). */
    @Override
    public void saveNamespace() throws IOException {
        saveNamespace(false, false);
    }

    /**
     * Saves the namespace.  On the active avatar this is a regular
     * save-namespace; on the standby it triggers a checkpoint instead
     * ("force" is unnecessary there because the standby is always in safemode).
     */
    @Override
    public void saveNamespace(boolean force, boolean uncompressed) throws IOException {
        // a manually triggered save resets any earlier cancellation
        namesystem.clearCancelSaveNamespace();
        if (currentAvatar != Avatar.ACTIVE) {
            standby.triggerCheckpoint(uncompressed);
        } else {
            super.saveNamespace(force, uncompressed);
        }
    }

    /**
     * @inheritDoc
     */
    @Override
    public boolean setSafeMode(SafeModeAction action) throws IOException {
        if (action != SafeModeAction.SAFEMODE_PREP_FAILOVER) {
            return super.setSafeMode(action);
        }
        // Prepare-failover: the active takes no action for now; the standby
        // stops checkpointing and tells its safemode to prepare failover.
        if (currentAvatar != Avatar.ACTIVE) {
            if (standby != null) {
                standby.disableCheckpoint();
                namesystem.cancelSaveNamespace("Prepare for failover");
            }
            if (standbySafeMode != null) {
                standbySafeMode.setPrepareFailover(true);
            }
        }
        return namesystem.isInSafeMode();
    }

    /**
     * Rejects the operation with {@code msg} when this node is a standby.
     *
     * @throws IOException when running as the standby avatar
     */
    void enforceActive(String msg) throws IOException {
        if (currentAvatar == Avatar.STANDBY) {
            throw new IOException(msg);
        }
    }

    /**
     * Rejects the operation with {@code msg} unless this node is a standby.
     *
     * @throws IOException when not running as the standby avatar
     */
    private void enforceStandby(String msg) throws IOException {
        if (currentAvatar != Avatar.STANDBY) {
            throw new IOException(msg);
        }
    }

    /** 
     * Registers the standby with this primary; rejected on a standby avatar
     * and for callers other than the configured checkpointer.
     *
     * @return the data transfer protocol version
     */
    @Override
    public int register() throws IOException {
        enforceActive("Standby can only register with active namenode");
        verifyCheckpointerAddress();
        return DataTransferProtocol.DATA_TRANSFER_VERSION;
    }

    /**
     * Validates that the caller matches the remote (checkpointing) namenode
     * address from the configuration.
     */
    private void verifyCheckpointerAddress() throws IOException {
        InetSocketAddress remote = getRemoteNamenodeAddress(getConf(), instance);
        validateCheckpointerAddress(remote.getAddress());
    }

    /**
     * Returns the hostname:port for the AvatarNode. The default
     * port for the AvatarNode is (the client RPC port 
     * of the underlying namenode + 1)
     */
    public static InetSocketAddress getAddress(Configuration conf) {
        InetSocketAddress clientAddr = NameNode.getClientProtocolAddress(conf);
        int defaultPort = clientAddr.getPort() + 1;
        int avatarPort = conf.getInt(AvatarNode.DFS_AVATARNODE_PORT_KEY, defaultPort);
        return new InetSocketAddress(clientAddr.getAddress(), avatarPort);
    }

    /**
     * Prints a one-line usage summary for the command line to stderr.
     */
    private static void printUsage() {
        String[] options = { StartupOption.STANDBY.getName(), StartupOption.NODEZERO.getName(),
                StartupOption.NODEONE.getName(), StartupOption.FORMAT.getName(), StartupOption.UPGRADE.getName(),
                StartupOption.ROLLBACK.getName(), StartupOption.FINALIZE.getName(), StartupOption.IMPORT.getName() };
        StringBuilder usage = new StringBuilder("Usage: java AvatarNode");
        for (int i = 0; i < options.length; i++) {
            // separator matches the original hand-built string exactly
            usage.append(i == 0 ? " [" : " | [").append(options[i]).append("]");
        }
        System.err.println(usage.toString());
    }

    /**
     * Validates command line arguments: a standby must not be started with an
     * option that alters on-disk state.
     *
     * @throws IOException on an illegal standby/option combination
     */
    static void validateStartupOptions(StartupInfo startInfo) throws IOException {
        if (!startInfo.isStandby) {
            return;
        }
        StartupOption opt = startInfo.startOpt;
        boolean forbidden = opt == StartupOption.FORMAT || opt == StartupOption.FINALIZE
                || opt == StartupOption.ROLLBACK || opt == StartupOption.UPGRADE;
        if (forbidden) {
            throw new IOException(
                    "Standby avatar node cannot be started with " + startInfo.startOpt + " option.");
        }
    }

    /**
     * Parsed command-line startup parameters for an AvatarNode.
     */
    static class StartupInfo {
        // startup action (REGULAR, FORMAT, UPGRADE, ...)
        StartupOption startOpt;
        // which of the two avatar instances this node is (NODEZERO / NODEONE)
        InstanceId instance;
        // true when starting in standby mode
        boolean isStandby;
        // service name from the -service option, or null when absent
        String serviceName;
        // true when the force option was supplied on the command line
        boolean forceStartup;

        public StartupInfo(StartupOption startOpt, InstanceId instance, boolean isStandby, String serviceName,
                boolean forceStartup) {
            this.startOpt = startOpt;
            this.instance = instance;
            this.isStandby = isStandby;
            this.serviceName = serviceName;
            this.forceStartup = forceStartup;
        }
    }

    /**
     * Analyzes the command line options and returns the parsed startup
     * parameters, or null when an argument is unrecognized or malformed.
     */
    private static StartupInfo parseArguments(String args[]) {
        InstanceId instance = InstanceId.NODEZERO;
        StartupOption startOpt = StartupOption.REGULAR;
        boolean isStandby = false;
        String serviceName = null;
        boolean force = false;
        final int argsLen = (args == null) ? 0 : args.length;
        for (int i = 0; i < argsLen; i++) {
            final String cmd = args[i];
            if (StartupOption.SERVICE.getName().equalsIgnoreCase(cmd)) {
                // the service flag consumes the following argument as the name
                if (++i >= argsLen) {
                    return null;
                }
                serviceName = args[i];
            } else if (StartupOption.STANDBY.getName().equalsIgnoreCase(cmd)) {
                isStandby = true;
            } else if (StartupOption.NODEZERO.getName().equalsIgnoreCase(cmd)) {
                instance = InstanceId.NODEZERO;
            } else if (StartupOption.NODEONE.getName().equalsIgnoreCase(cmd)) {
                instance = InstanceId.NODEONE;
            } else if (StartupOption.FORCE.getName().equalsIgnoreCase(cmd)) {
                force = true;
            } else {
                StartupOption matched = matchStartupOption(cmd);
                if (matched == null) {
                    return null;
                }
                startOpt = matched;
            }
        }
        return new StartupInfo(startOpt, instance, isStandby, serviceName, force);
    }

    /** Maps a command-line token to its startup option, or null if unrecognized. */
    private static StartupOption matchStartupOption(String cmd) {
        StartupOption[] candidates = { StartupOption.FORMAT, StartupOption.FORMATFORCE, StartupOption.REGULAR,
                StartupOption.UPGRADE, StartupOption.ROLLBACK, StartupOption.FINALIZE, StartupOption.IMPORT };
        for (StartupOption candidate : candidates) {
            if (candidate.getName().equalsIgnoreCase(cmd)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Records the startup command in the configuration under the
     * "dfs.namenode.startup" key.
     */
    private static void setStartupOption(Configuration conf, StartupOption opt) {
        final String key = "dfs.namenode.startup";
        conf.set(key, opt.toString());
    }

    /**
     * Creates an AvatarNode from command-line arguments, using a fresh
     * {@code RunInfo}.
     *
     * @throws IOException if startup fails
     */
    public static AvatarNode createAvatarNode(String argv[], Configuration conf) throws IOException {
        return createAvatarNode(argv, conf, new RunInfo());
    }

    /**
     * HDFS federation configuration keys that are specific to a name service.
     * These keys are suffixed with the nameserviceId in the configuration,
     * for example "dfs.namenode.rpc-address.nameservice1".
     *
     * Following are nameservice specific keys.
     */
    final private static String DFS_AVATARNODE_PORT_KEY = "dfs.avatarnode.port";
    final private static String DFS_SHARED_NAME_DIR0_KEY = "dfs.name.dir.shared0";
    final private static String DFS_SHARED_NAME_DIR1_KEY = "dfs.name.dir.shared1";
    final public static String DFS_SHARED_EDITS_DIR0_KEY = "dfs.name.edits.dir.shared0";
    final public static String DFS_SHARED_EDITS_DIR1_KEY = "dfs.name.edits.dir.shared1";
    // suffixes distinguishing the two avatar instances in config keys
    final private static String ZERO = "0";
    final private static String ONE = "1";
    final public static String DFS_NAMENODE_RPC_ADDRESS0_KEY = DFS_NAMENODE_RPC_ADDRESS_KEY + ZERO;
    final public static String DFS_NAMENODE_RPC_ADDRESS1_KEY = DFS_NAMENODE_RPC_ADDRESS_KEY + ONE;

    // keys copied from their service-suffixed variants by initializeGenericKeys()
    public static final String[] AVATARSERVICE_SPECIFIC_KEYS = { DFS_AVATARNODE_PORT_KEY,
            DFS_NAMENODE_RPC_ADDRESS0_KEY, DFS_NAMENODE_RPC_ADDRESS1_KEY, DATANODE_PROTOCOL_ADDRESS + ZERO,
            DATANODE_PROTOCOL_ADDRESS + ONE, DFS_NAMENODE_HTTP_ADDRESS_KEY + ZERO,
            DFS_NAMENODE_HTTP_ADDRESS_KEY + ONE, DFS_SHARED_NAME_DIR0_KEY, DFS_SHARED_NAME_DIR1_KEY,
            DFS_SHARED_EDITS_DIR0_KEY, DFS_SHARED_EDITS_DIR1_KEY, };

    /**  
     * In federation, configuration is set for a set of
     * avatarnodes, namenodes etc., which are
     * grouped under a logical nameservice ID. The configuration keys specific 
     * to them have their suffix set to the configured nameserviceId.
     * 
     * This method copies the value from the specific key of format key.nameserviceId
     * to key, to set up the generic configuration. Once this is done, only the
     * generic version of the configuration is read in the rest of the code, for
     * backward compatibility and simpler code changes.
     * 
     * @param conf
     *          Configuration object to lookup specific key and to set the value
     *          to the key passed. Note the conf object is modified
     * @param serviceKey nameservice suffix; a null or empty value is a no-op
     * @see DFSUtil#setGenericConf(Configuration, String, String...)
     */
    public static void initializeGenericKeys(Configuration conf, String serviceKey) {
        if ((serviceKey == null) || serviceKey.isEmpty()) {
            return;
        }
        NameNode.initializeGenericKeys(conf, serviceKey);

        DFSUtil.setGenericConf(conf, serviceKey, AVATARSERVICE_SPECIFIC_KEYS);

        // adjust meta directory names for this service
        adjustMetaDirectoryNames(conf, serviceKey);
    }

    /** Appends the service name to each avatar meta directory name.
     * 
     * @param conf configuration of NameNode
     * @param serviceKey the non-empty name of the name node service
     */
    public static void adjustMetaDirectoryNames(Configuration conf, String serviceKey) {
        String[] sharedDirKeys = { DFS_SHARED_NAME_DIR0_KEY, DFS_SHARED_NAME_DIR1_KEY, DFS_SHARED_EDITS_DIR0_KEY,
                DFS_SHARED_EDITS_DIR1_KEY };
        for (String dirKey : sharedDirKeys) {
            adjustMetaDirectoryName(conf, dirKey, serviceKey);
        }
    }

    /**
     * Allows online reconfiguration of the peer avatar's RPC address; every
     * other property is delegated to the base namenode.
     */
    @Override
    public void reconfigurePropertyImpl(String property, String newVal) throws ReconfigurationException {
        // each instance may only reconfigure the address of the OTHER instance
        final String peerAddressKey;
        if (instance == InstanceId.NODEZERO) {
            peerAddressKey = DFS_NAMENODE_RPC_ADDRESS1_KEY;
        } else if (instance == InstanceId.NODEONE) {
            peerAddressKey = DFS_NAMENODE_RPC_ADDRESS0_KEY;
        } else {
            peerAddressKey = "";
        }
        if (property.equals(peerAddressKey)) {
            getConf().set(property, newVal);
            return;
        }
        super.reconfigurePropertyImpl(property, newVal);
    }

    /**
     * Adds the peer avatar's RPC address key to the base namenode's list of
     * reconfigurable properties.
     */
    @Override
    public List<String> getReconfigurableProperties() {
        List<String> properties = super.getReconfigurableProperties();
        if (instance == InstanceId.NODEZERO) {
            properties.add(DFS_NAMENODE_RPC_ADDRESS1_KEY);
        } else if (instance == InstanceId.NODEONE) {
            properties.add(DFS_NAMENODE_RPC_ADDRESS0_KEY);
        }
        return properties;
    }

    /**
     * Tries to bind to the address specified in ZooKeeper; this will always fail
     * if the primary is alive either on the same machine or on a remote machine.
     *
     * @param zkRegistry the "host:port" string registered in ZooKeeper
     * @throws IOException if the bind fails (i.e. the primary appears alive)
     * @throws IllegalArgumentException if the registry string is malformed
     */
    private static void isPrimaryAlive(String zkRegistry) throws IOException {
        String parts[] = zkRegistry.split(":");
        if (parts.length != 2) {
            throw new IllegalArgumentException("Invalid Address : " + zkRegistry);
        }
        String host = parts[0];
        int port = Integer.parseInt(parts[1]);
        InetSocketAddress clientSocket = new InetSocketAddress(host, port);
        ServerSocket socket = new ServerSocket();
        try {
            socket.bind(clientSocket);
        } finally {
            // always release the probe socket; the original leaked it when
            // bind() threw (the expected path when the primary is alive)
            socket.close();
        }
    }

    /**
     * Logs the given message at error level and aborts startup by throwing it
     * as an IOException.
     */
    private static void failStartup(String message) throws IOException {
        IOException failure = new IOException(message);
        LOG.error(message);
        throw failure;
    }

    /**
     * Reads the directory list for {@code dirKey} and substitutes the
     * instance token ("zero"/"one") for the configured wildcard.
     */
    private static String getWildcardDir(String instance, Configuration conf, String dirKey) {
        String configured = conf.get(dirKey);
        return configured.replaceAll(FSConstants.DFS_NAMENODE_NAME_DIR_WILDCARD, instance);
    }

    /**
     * When avatarone and avatarzero both write to the filer in the case where
     * the filer is not the shared directory (when we use QJM), the two nodes
     * must write to different directories. For this purpose dfs.name.dir and
     * dfs.name.edits.dir may carry a wildcard, e.g.
     *
     * <property>
     * <name>dfs.name.dir</name>
     * <value>/hadoop/<cluster>/,/mnt/fsimage/<cluster>/%</value>
     * </property>
     *
     * Here the wildcard is replaced with "zero" or "one" depending on the
     * instance, so each node ends up with its own location.
     */
    private static void processNameDirectories(Configuration conf, InstanceId instanceId) {
        if (instanceId != InstanceId.NODEONE && instanceId != InstanceId.NODEZERO) {
            return;
        }
        String token = (instanceId == InstanceId.NODEZERO) ? "zero" : "one";

        // Expand the wildcard in the edits directories first, then the image
        // directories (same order as before).
        for (String dirKey : new String[] { FSConstants.DFS_NAMENODE_EDITS_DIR_KEY,
                FSConstants.DFS_NAMENODE_NAME_DIR_KEY }) {
            conf.set(dirKey, getWildcardDir(token, conf, dirKey));
        }
    }

    /**
     * Parses the startup arguments, validates this node's intended role
     * (primary or standby) against the registration stored in ZooKeeper,
     * prepares the storage directories (copying the image from the primary
     * when starting as standby), and constructs the AvatarNode.
     *
     * @param argv command-line arguments (instance id, startup option, service name)
     * @param conf configuration to use; a fresh Configuration is created when null
     * @param runInfo shared run-state flags for the driver loop in main()
     * @return the new AvatarNode, or null on usage error, invalid service
     *         name, or the FORMATFORCE startup option
     * @throws IOException if ZooKeeper validation or storage setup fails
     */
    public static AvatarNode createAvatarNode(String argv[], Configuration conf, RunInfo runInfo)
            throws IOException {
        if (conf == null) {
            conf = new Configuration();
        }

        Configuration startupConf = conf; // save configuration at startup
        StartupInfo startInfo = parseArguments(argv);
        StartupOption startOpt = startInfo.startOpt;

        if (startInfo.instance != null) {
            // conf and startupConf reference the same object at this point, so
            // the second call is a no-op (wildcard substitution is idempotent);
            // conf is re-assigned to a copy further below.
            processNameDirectories(conf, startInfo.instance);
            processNameDirectories(startupConf, startInfo.instance);
        }

        if (startOpt == null) {
            printUsage();
            return null;
        }
        if (!validateServiceName(conf, startInfo.serviceName)) {
            return null;
        }

        initializeGenericKeys(conf, startInfo.serviceName);
        setStartupOption(conf, startOpt);
        // From here on conf is a per-instance copy with instance-suffixed
        // address keys promoted to the generic keys.
        conf = updateAddressConf(conf, startInfo.instance);
        NameNode.setupDefaultURI(conf);

        // sync cannot be specified along with format or finalize
        validateStartupOptions(startInfo);

        // We need to check the zookeeper so that the node starting as active
        // is the one registered with the zookeeper
        // and if the node is starting as standby there has to be a master
        // already so that the node doesn't move the log and the image
        String fsname = startupConf.get(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY);
        String actualName = conf.get(NameNode.DFS_NAMENODE_RPC_ADDRESS_KEY);

        String errorMsg = null;
        String zkRegistry = AvatarNodeZkUtil.getPrimaryRegistration(startupConf, startupConf, fsname);

        if (zkRegistry == null) {
            // The registry is empty. Usually this means failover is in progress
            // we need to manually fix it before starting primary.
            // With forceStartup set, the empty registry is tolerated.
            if (!startInfo.forceStartup) {
                errorMsg = "A zNode that indicates the primary is empty. "
                        + "AvatarNode can only be started as primary if it "
                        + "is registered as primary with ZooKeeper";
                failStartup(errorMsg);
            }
        } else {
            // A primary is registered: it must be us, unless we start as standby.
            if (!zkRegistry.equalsIgnoreCase(actualName) && !startInfo.isStandby) {
                errorMsg = "Registration information in ZooKeeper doesn't "
                        + "match the address of this node. AvatarNode can "
                        + "only be started as primary if it is registered as "
                        + "primary with ZooKeeper. zkRegistry = " + zkRegistry + ", actual name = " + actualName;
                failStartup(errorMsg);
            }
        }

        // Probe the registered address: if another primary is alive the bind
        // inside isPrimaryAlive fails and startup aborts. zkRegistry is
        // non-null here (the null case either threw above or set forceStartup).
        if (!startInfo.isStandby && !startInfo.forceStartup) {
            isPrimaryAlive(zkRegistry);
        }

        long ssid = 0;
        // We are the primary avatar, write session Id to ZK.
        if (!startInfo.isStandby) {
            ssid = AvatarNodeZkUtil.writeSessionIdToZK(startupConf);
        }

        // If sync is requested, then we copy only the fsimage
        //  (and not the transaction logs) from the other node. 
        // If we are NODEONE, then modify the configuration to 
        // set fs.name.dir, fs.default.name and dfs.http.address.
        //

        // setup rpc proxy if we are starting as standby
        NamenodeProtocol primaryNamenode = null;
        InetSocketAddress nameNodeAddr = null;
        if (startInfo.isStandby) {
            nameNodeAddr = getRemoteNamenodeAddress(conf, startInfo.instance);
            FLOG.info("Connecting to the primary namenode: " + nameNodeAddr);
            primaryNamenode = (NamenodeProtocol) RPC.waitForProxy(NamenodeProtocol.class,
                    NamenodeProtocol.versionID, nameNodeAddr, conf);
            // make sure we can talk to this primary
            int primaryDataTransferVersion = primaryNamenode.register();
            // check if we have the same data transfer version as primary 
            InjectionHandler.processEventIO(InjectionEvent.AVATARNODE_RECEIVED_DATA_TRANSFER_VERSION,
                    primaryDataTransferVersion);
            int standbyDataTransferVersion = DataTransferProtocol.DATA_TRANSFER_VERSION;
            if (standbyDataTransferVersion != primaryDataTransferVersion && !startInfo.forceStartup) {
                throw new IncorrectVersionException(primaryDataTransferVersion, "data transfer",
                        standbyDataTransferVersion);
            }
        }

        conf = setupAvatarNodeStorage(conf, startInfo, primaryNamenode);

        // namenode options.
        switch (startOpt) {
        case FORMAT:
            // System.exit below never returns, so there is no fall-through.
            boolean aborted = format(conf, false, true);
            System.exit(aborted ? 1 : 0);
        case FORMATFORCE:
            aborted = format(conf, true, false);
            return null;
        case FINALIZE:
            aborted = finalize(conf, true);
            System.exit(aborted ? 1 : 0);
        default:
        }

        // We need to put the Namenode into safemode as soon as it starts up.
        // There is a race condition, where before the Standby AvatarNode can put
        // the NameNode into safemode, the NameNode might leave safemode. This could
        // occur in the case of a start where the FSImage and FSEdits are empty
        // and hence the NameNode doesn't wait at all in safemode.
        if (startInfo.isStandby) {
            conf.setClass("dfs.safemode.impl", StandbySafeMode.class, SafeModeInfo.class);
        }
        // set persisting blocks to be true
        conf.setBoolean("dfs.persist.blocks", true);

        return new AvatarNode(startupConf, conf, startInfo, runInfo, ssid, nameNodeAddr, primaryNamenode);
    }

    /** True when the URI's scheme is the local "file" journal type. */
    private static boolean isFile(URI uri) throws IOException {
        String fileScheme = JournalType.FILE.name().toLowerCase();
        return uri.getScheme().equals(fileScheme);
    }

    /**
     * Return the configuration that should be used by this instance of
     * AvatarNode: local + shared name/edits directories merged, with the
     * shared device first in the list. When starting as standby, also formats
     * the local storage and copies the most recent fsimage from the primary
     * (but not the transaction logs).
     *
     * @param conf base configuration for this instance
     * @param startInfo parsed startup options (instance id, standby flag)
     * @param primaryNamenode RPC proxy to the primary; only used (and only
     *        non-null) when starting as standby
     * @return a new Configuration with dfs.name.dir / dfs.name.edits.dir set
     * @throws IOException on validation failure or image download failure
     */
    static Configuration setupAvatarNodeStorage(Configuration conf, StartupInfo startInfo,
            NamenodeProtocol primaryNamenode) throws IOException {

        // shared locations for image and edits
        URI img0 = NNStorageConfiguration.getURIKey(conf, "dfs.name.dir.shared0");
        URI img1 = NNStorageConfiguration.getURIKey(conf, "dfs.name.dir.shared1");
        URI edit0 = NNStorageConfiguration.getURIKey(conf, "dfs.name.edits.dir.shared0");
        URI edit1 = NNStorageConfiguration.getURIKey(conf, "dfs.name.edits.dir.shared1");

        // local locations for image and edits
        Collection<URI> namedirs = NNStorageConfiguration.getNamespaceDirs(conf, null);
        Collection<URI> editsdir = NNStorageConfiguration.getNamespaceEditsDirs(conf, null);

        // validate correctness of the configuration
        AvatarStorageSetup.validate(conf, namedirs, editsdir, img0, img1, edit0, edit1);

        FileSystem localFs = FileSystem.getLocal(conf).getRaw();

        URI ownSharedImage = null;
        URI ownSharedEdits = null;

        //
        // Pick the shared image/edits locations owned by this instance:
        // instance one uses the *1 locations, instance zero the *0 locations.
        //
        if (startInfo.instance == InstanceId.NODEONE) {
            ownSharedImage = img1;
            ownSharedEdits = edit1;
        } else if (startInfo.instance == InstanceId.NODEZERO) {
            ownSharedImage = img0;
            ownSharedEdits = edit0;
        }

        // allocate a new configuration and update fs.name.dir appropriately.
        // The shared device should be the first in the list.
        Configuration newconf = new Configuration(conf);
        AvatarStorageSetup.updateConf(startInfo, newconf, namedirs, img0, img1, "dfs.name.dir");

        // update fs.name.edits.dir appropriately in the new configuration.
        // The shared device should be the first in the list.
        AvatarStorageSetup.updateConf(startInfo, newconf, editsdir, edit0, edit1, "dfs.name.edits.dir");

        // copy fsimage directory if needed
        if (startInfo.isStandby) {

            // do not open edit log at startup
            newconf.setBoolean("dfs.namenode.openlog", false);

            // connect to primary (http address, used for the image transfer)
            String fsName = getRemoteNamenodeHttpName(conf, startInfo.instance);

            // temporary FSImage used only to format and populate the storage;
            // closed at the end of this block
            FSImage tempImage = new FSImage(newconf, NNStorageConfiguration.getNamespaceDirs(newconf),
                    NNStorageConfiguration.getNamespaceEditsDirs(newconf), null);

            // will block until Primary has left the safemode
            CheckpointSignature cs = getCheckpointSignature(primaryNamenode);

            long lastCheckpointTxId = cs.mostRecentCheckpointTxId;

            if (cs.layoutVersion != FSConstants.LAYOUT_VERSION) {
                throw new IOException("Upgrade for standby is not supported");
            }

            // back up any existing file-based shared image/edits before they
            // are clobbered by the format below
            if (isFile(ownSharedImage)) {
                File destFile = new File(ownSharedImage.getPath());
                NNStorageDirectoryRetentionManager.backupFiles(localFs, destFile, conf);
            }
            if (isFile(ownSharedEdits)) {
                File destFile = new File(ownSharedEdits.getPath());
                NNStorageDirectoryRetentionManager.backupFiles(localFs, destFile, newconf);
            }

            // setup storage: format, then stamp with the primary's checkpoint
            // signature so both nodes agree on namespace/storage info
            NNStorage tempStorage = tempImage.storage;
            tempStorage.format();
            tempStorage.setStorageInfo(cs);
            tempStorage.writeAll();
            tempImage.editLog.transitionNonFileJournals(tempStorage, false, Transition.FORMAT, null);
            tempImage.transitionNonFileImages(tempStorage, false, Transition.FORMAT);

            // we need to become the active writer to upload image successfully to
            // non-file images storage
            tempImage.editLog.recoverUnclosedStreams();

            // Download the image to all storage directories
            FLOG.info("Downloading image to all storage directories.");
            MD5Hash digest = downloadImageToStorage(fsName, lastCheckpointTxId, tempImage);
            // NOTE(review): badSds is never populated, so the
            // reportErrorsOnDirectories call below is effectively a no-op --
            // confirm whether directories failing the download should be added.
            List<StorageDirectory> badSds = new ArrayList<StorageDirectory>();
            tempStorage.checkpointUploadDone(lastCheckpointTxId, digest);
            FLOG.info("Downloading image to all storage directories. DONE");
            tempImage.saveDigestAndRenameCheckpointImage(lastCheckpointTxId, digest);
            tempStorage.reportErrorsOnDirectories(badSds, tempImage);
            tempStorage.close();
            tempImage.close();
        }
        return newconf;
    }

    /**
     * Downloads the checkpoint image from the primary into all storage
     * directories, retrying up to three times on socket timeouts with a
     * one-second pause between attempts.
     *
     * @param fsName http address of the primary namenode
     * @param lastCheckpointTxId transaction id of the checkpoint to fetch
     * @param tempImage image whose storage directories receive the download
     * @return MD5 digest of the downloaded image
     * @throws IOException the last timeout, if all attempts time out; other
     *         IOExceptions from the transfer propagate immediately
     */
    private static MD5Hash downloadImageToStorage(String fsName, long lastCheckpointTxId, FSImage tempImage)
            throws IOException {
        final int maxAttempts = 3;
        IOException lastTimeout = null;
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            try {
                // disable throttling for image download
                return TransferFsImage.downloadImageToStorage(fsName, lastCheckpointTxId, tempImage, true, true);
            } catch (SocketTimeoutException ex) {
                lastTimeout = ex;
                LOG.info("Downloading image - socket timeout exception. Will retry...");
                if (attempt == maxAttempts - 1) {
                    break; // no point sleeping after the final attempt
                }
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                    // Preserve the interrupt status and stop retrying.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        throw lastTimeout;
    }

    /**
     * Fetches the checkpoint signature from the primary namenode, retrying
     * every second while the primary is unavailable (e.g. still in safemode).
     *
     * @param primaryNamenode RPC proxy to the primary
     * @return the primary's current checkpoint signature
     * @throws IOException if the waiting thread is interrupted
     */
    private static CheckpointSignature getCheckpointSignature(NamenodeProtocol primaryNamenode) throws IOException {
        while (true) {
            try {
                return primaryNamenode.getCheckpointSignature();
            } catch (RemoteException e) {
                LOG.info("Active namenode is not available. Standby cannot initialize", e);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ex) {
                    // Restore the interrupt status before wrapping: callers up
                    // the stack can then still observe the interruption.
                    Thread.currentThread().interrupt();
                    throw new IOException(ex);
                }
            }
        }
    }

    /**
     * Returns a copy of {@code conf} in which the generic address keys
     * (dfs.http.address, dfs.namenode.dn-address, the RPC address key and
     * fs.default.name) are overridden with the instance-suffixed values for
     * the given instance. Also writes fs.default.name&lt;suffix&gt; back into
     * the original conf when derived from the RPC address key, matching the
     * original behavior.
     *
     * @param conf source configuration (mutated only for fs.default.name&lt;suffix&gt;)
     * @param instance which avatar instance is starting
     * @return a new Configuration with the instance-specific addresses applied
     */
    public static Configuration updateAddressConf(Configuration conf, InstanceId instance) {
        Configuration newconf = new Configuration(conf);
        // if we are starting as the other namenode, then change the
        // default URL to make the namenode attach to the appropriate URL
        if (instance == InstanceId.NODEZERO) {
            applyInstanceAddresses(conf, newconf, "0", AvatarNode.DFS_NAMENODE_RPC_ADDRESS0_KEY);
        } else if (instance == InstanceId.NODEONE) {
            applyInstanceAddresses(conf, newconf, "1", AvatarNode.DFS_NAMENODE_RPC_ADDRESS1_KEY);
        }
        return newconf;
    }

    /**
     * Copies the instance-suffixed address keys into the generic keys of
     * {@code newconf}. Factored out of updateAddressConf, where the NODEZERO
     * and NODEONE branches were identical except for the suffix and RPC key.
     */
    private static void applyInstanceAddresses(Configuration conf, Configuration newconf,
            String suffix, String instanceRpcKey) {
        String fs = conf.get("dfs.http.address" + suffix);
        if (fs != null) {
            newconf.set("dfs.http.address", fs);
        }
        fs = conf.get("dfs.namenode.dn-address" + suffix);
        if (fs != null) {
            newconf.set("dfs.namenode.dn-address", fs);
        }
        fs = conf.get(instanceRpcKey);
        if (fs != null) {
            newconf.set(AvatarNode.DFS_NAMENODE_RPC_ADDRESS_KEY, fs);
            newconf.set("fs.default.name" + suffix, fs);
            // Intentionally also recorded in the caller's conf, as before.
            conf.set("fs.default.name" + suffix, fs);
        }
        fs = conf.get("fs.default.name" + suffix);
        if (fs != null) {
            newconf.set("fs.default.name", fs);
        }
    }

    /**
     * Returns the RPC address of the remote (peer) namenode, resolved from
     * the opposite instance's configuration keys, falling back to the
     * instance-suffixed fs.default.name.
     *
     * @throws IOException if the instance is neither NODEZERO nor NODEONE
     */
    static InetSocketAddress getRemoteNamenodeAddress(Configuration conf, InstanceId instance) throws IOException {
        String remoteAddr;
        if (instance == InstanceId.NODEZERO) {
            remoteAddr = conf.get(DFS_NAMENODE_RPC_ADDRESS1_KEY);
            if (remoteAddr == null) {
                remoteAddr = conf.get("fs.default.name1");
            }
        } else if (instance == InstanceId.NODEONE) {
            remoteAddr = conf.get(DFS_NAMENODE_RPC_ADDRESS0_KEY);
            if (remoteAddr == null) {
                remoteAddr = conf.get("fs.default.name0");
            }
        } else {
            throw new IOException("Unknown instance " + instance);
        }
        if (remoteAddr != null) {
            // Overlay the remote address onto a copy so the caller's conf is untouched.
            Configuration overlay = new Configuration(conf);
            overlay.set(FSConstants.DFS_NAMENODE_RPC_ADDRESS_KEY, remoteAddr);
            conf = overlay;
        }
        return NameNode.getClientProtocolAddress(conf);
    }

    /**
     * Returns the http server address of the remote (peer) namenode, i.e. the
     * dfs.http.address key with the opposite instance's suffix.
     *
     * @throws IOException if the instance is neither NODEZERO nor NODEONE
     */
    static String getRemoteNamenodeHttpName(Configuration conf, InstanceId instance) throws IOException {
        if (instance == InstanceId.NODEZERO) {
            return conf.get("dfs.http.address1");
        }
        if (instance == InstanceId.NODEONE) {
            return conf.get("dfs.http.address0");
        }
        throw new IOException("Unknown instance " + instance);
    }

    /**
     * Return the shared edits directory URI of the remote (peer) NameNode.
     */
    URI getRemoteSharedEditsURI(Configuration conf) throws IOException {
        return getSharedEditsURI(conf, true);
    }

    /**
     * Return the shared edits directory URI of this (local) NameNode.
     */
    URI getLocalSharedEditsURI(Configuration conf) throws IOException {
        return getSharedEditsURI(conf, false);
    }

    /**
     * Resolves the shared edits URI for either this node or its peer:
     * NODEZERO owns shared0 and its peer owns shared1, and vice versa.
     *
     * @param remote true for the peer's shared edits, false for our own
     * @throws IOException if this node's instance id is invalid
     */
    private URI getSharedEditsURI(Configuration conf, boolean remote) throws IOException {
        final String editsKey;
        if (instance == InstanceId.NODEZERO) {
            editsKey = remote ? "dfs.name.edits.dir.shared1" : "dfs.name.edits.dir.shared0";
        } else if (instance == InstanceId.NODEONE) {
            editsKey = remote ? "dfs.name.edits.dir.shared0" : "dfs.name.edits.dir.shared1";
        } else {
            LOG.info("Instance is invalid. " + instance);
            throw new IOException("Instance is invalid. " + instance);
        }
        return NNStorageConfiguration.getURIKey(conf, editsKey);
    }

    /**
     * Returns the transaction id at which the current checkpoint of this
     * AvatarNode started (not a wall-clock time).
     */
    long getStartCheckpointTxId() {
        return startCheckpointTxId;
    }

    /** Records the transaction id at which the current checkpoint started. */
    void setStartCheckpointTxId(long txid) {
        startCheckpointTxId = txid;
    }

    /**
     * Indicates that the AvatarNode should restart: sets the shared flag
     * observed by the driver loop in main().
     */
    void doRestart() {
        runInfo.doRestart = true;
    }

    /**
     * Current system time (wall clock).
     * @return current time in milliseconds since the epoch.
     */
    static long now() {
        return System.currentTimeMillis();
    }

    /**
     * Collects avatar-specific status key/value pairs for reporting: the last
     * applied transaction id plus, depending on the current avatar, either
     * standby ingest/failover state or the active node's last checkpoint info.
     * On any failure the map collected so far is returned (partial data).
     */
    protected Map<NameNodeKey, String> getNameNodeSpecificKeys() {
        Map<NameNodeKey, String> map = new HashMap<NameNodeKey, String>();
        try {

            map.put(new NameNodeKey("Last applied transaction id", NameNodeKey.BOTH),
                    toStr(getFSImage().getEditLog().getLastWrittenTxId()));

            if (currentAvatar == Avatar.STANDBY) {
                // standby may be null during startup; each entry degrades to a
                // default value rather than failing the whole map.
                map.put(new NameNodeKey("Standby: ignore datanodes", NameNodeKey.STANDBY),
                        toStr(this.ignoreDatanodes()));
                map.put(new NameNodeKey("Standby: ingest state", NameNodeKey.STANDBY),
                        toStr((standby == null) ? "" : standby.currentIngestState));
                map.put(new NameNodeKey("Standby: ingest fell behind", NameNodeKey.STANDBY),
                        toStr((standby == null) ? "" : standby.fellBehind()));
                map.put(new NameNodeKey("Standby: ingest lag bytes", NameNodeKey.STANDBY),
                        toStr((standby == null) ? 0L : standby.getLagBytes()));
                map.put(new NameNodeKey("Standby: checkpoint status", NameNodeKey.STANDBY),
                        toStr((standby == null) ? "" : standby.getCheckpointStatus()));
                // NOTE(review): standbySafeMode is dereferenced without a null
                // check (unlike standby above); a null here is caught below and
                // yields a partial map -- confirm it is always set in standby mode.
                map.put(new NameNodeKey("Standby: failover in progress", NameNodeKey.STANDBY),
                        toStr(standbySafeMode.failoverInProgress()));
                if (standbySafeMode.failoverInProgress()) {
                    map.put(new NameNodeKey("Standby: failover outstanding heartbeats", NameNodeKey.STANDBY),
                            toStr(standbySafeMode.getOutStandingHeartbeats().size()));
                    map.put(new NameNodeKey("Standby: failover outstanding reports", NameNodeKey.STANDBY),
                            toStr(standbySafeMode.getOutStandingReports().size()));
                }

            } else {
                map.put(new NameNodeKey("Last checkpoint txid", NameNodeKey.ACTIVE),
                        toStr(this.getFSImage().storage.getMostRecentCheckpointTxId()));
                map.put(new NameNodeKey("Last checkpoint time", NameNodeKey.ACTIVE),
                        this.getFSImage().storage.getMostRecentCheckpointTime());

            }
        } catch (Exception e) {
            // send partial information; log with the throwable so the stack
            // trace is not lost (e.toString() alone discarded it)
            LOG.error("Failed to collect namenode specific keys", e);
        }
        return map;
    }

    /** Returns true iff this node is currently the active (primary) avatar. */
    protected boolean getIsPrimary() {
        return currentAvatar == Avatar.ACTIVE;
    }

    // Shorthand for Object.toString(); note it throws NPE on a null argument.
    private String toStr(Object o) {
        return o.toString();
    }

    /**
     * Mutable run-state flags shared between the AvatarNode and the driver
     * loop in main(). All fields are volatile because they are written and
     * read from different threads.
     */
    public static class RunInfo {
        volatile boolean doRestart; // request a full restart of the node
        volatile boolean shutdown; // request RPC shutdown on exit
        volatile boolean isRunning; // cleared once the node stops serving

        public RunInfo(boolean doRestart, boolean shutdown, boolean isRunning) {
            this.doRestart = doRestart;
            this.shutdown = shutdown;
            this.isRunning = isRunning;
        }

        public RunInfo() {
            // Defaults: not restarting, not shutting down, running.
            this(false, false, true);
        }

    }

    /** Returns the RPC server address of this NameNode. */
    public InetSocketAddress getNameNodeAddress() {
        return serverAddress;
    }

    // Returns the standby safe-mode handler; presumably null unless running
    // as standby -- confirm against where standbySafeMode is assigned.
    public StandbySafeMode getStandbySafeMode() {
        return this.standbySafeMode;
    }

    /** Returns which instance (NODEZERO/NODEONE) this AvatarNode is. */
    public InstanceId getInstanceId() {
        return this.instance;
    }

    /**
     * Entry point: creates an AvatarNode and blocks until it stops, restarting
     * it in a loop while runInfo.doRestart is set. Exits with status 1 if the
     * final run ended with an exception.
     */
    public static void main(String argv[]) throws Exception {
        org.apache.hadoop.hdfs.DnsMonitorSecurityManager.setTheManager();
        Exception exception = null;
        AvatarNode avatarnode = null;
        RunInfo runInfo = new RunInfo();
        do {
            // reset per-iteration state before (re)starting the node
            runInfo.doRestart = false;
            runInfo.isRunning = true;
            exception = null;
            try {
                StringUtils.startupShutdownMessage(AvatarNode.class, argv, LOG);
                FastWritableHDFS.init();
                FastProtocolHDFS.init();
                avatarnode = createAvatarNode(argv, null, runInfo);
                if (avatarnode != null) {
                    avatarnode.waitForRestart();
                }
            } catch (Throwable e) {
                LOG.error(StringUtils.stringifyException(e));
                if (runInfo.doRestart) {
                    LOG.error("AvatarNode restarting...");
                } else {
                    exception = new Exception(StringUtils.stringifyException(e));
                }
            }
        } while (runInfo.doRestart == true);
        // Guard against avatarnode being null (usage error, or createAvatarNode
        // threw before assignment) -- previously this was an NPE.
        if (runInfo.shutdown && avatarnode != null) {
            avatarnode.stopRPC(true);
        }
        if (exception != null) {
            LOG.fatal("Exception running avatarnode. Shutting down", exception);
            Runtime.getRuntime().exit(1);
        }
    }

    /** Returns true once initialization of this AvatarNode has completed. */
    public boolean isInitDone() {
        return isInitialized;
    }
}