org.apache.hadoop.hdfs.server.datanode.AvatarDataNode.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.datanode.AvatarDataNode.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import static org.apache.hadoop.hdfs.server.namenode.NameNode.DATANODE_PROTOCOL_ADDRESS;

import java.io.File;
import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.net.NoRouteToHostException;
import java.net.PortUnreachableException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.AvatarZooKeeperClient;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.FastProtocolHDFS;
import org.apache.hadoop.hdfs.FastWritableHDFS;
import org.apache.hadoop.hdfs.protocol.AvatarProtocol;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.UnregisteredDatanodeException;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.data.Stat;

/**
 * This is an implementation of the AvatarDataNode, a wrapper
 * for a regular datanode that works with AvatarNode.
 * 
 * The AvatarDataNode is needed to make a vanilla DataNode send
 * block reports to Primary and standby namenodes. The AvatarDataNode
 * does not know which one of the namenodes is primary and which is
 * secondary.
 *
 * Typically, an adminstrator will have to specify the pair of
 * AvatarNodes via fs1.default.name and fs2.default.name
 *
 */

public class AvatarDataNode extends DataNode {

    static {
        Configuration.addDefaultResource("avatar-default.xml");
        Configuration.addDefaultResource("avatar-site.xml");
    }
    public static final Log LOG = LogFactory.getLog(AvatarDataNode.class.getName());

    public AvatarDataNode(Configuration conf, AbstractList<File> dataDirs, String dnThreadName) throws IOException {
        super(conf, dataDirs);

        AvatarDataNode.dnThreadName = dnThreadName;
    }

    @Override
    void startDataNode(Configuration conf, AbstractList<File> dataDirs) throws IOException {
        initGlobalSetting(conf, dataDirs);
        Collection<String> serviceIds = DFSUtil.getNameServiceIds(conf);
        List<String> defaultAddresses = getZnodePaths(serviceIds, conf);
        List<InetSocketAddress> nameAddrs0 = DFSUtil.getRPCAddresses("0", conf, serviceIds,
                DATANODE_PROTOCOL_ADDRESS, DFS_NAMENODE_RPC_ADDRESS_KEY);
        List<InetSocketAddress> nameAddrs1 = DFSUtil.getRPCAddresses("1", conf, serviceIds,
                DATANODE_PROTOCOL_ADDRESS, DFS_NAMENODE_RPC_ADDRESS_KEY);
        List<InetSocketAddress> avatarAddrs0 = AvatarDataNode.getAvatarNodeAddresses("0", conf, serviceIds);
        List<InetSocketAddress> avatarAddrs1 = AvatarDataNode.getAvatarNodeAddresses("1", conf, serviceIds);

        namespaceManager = new AvatarNamespaceManager(nameAddrs0, nameAddrs1, avatarAddrs0, avatarAddrs1,
                defaultAddresses, DFSUtil.getNameServiceIds(conf));

        initDataSetAndScanner(conf, dataDirs, nameAddrs0.size());
    }

    @Override
    protected void notifyNamenodeReceivedBlock(int namespaceId, Block block, String delHint) throws IOException {
        if (block == null) {
            throw new IllegalArgumentException("Block is null");
        }
        ((AvatarNamespaceManager) namespaceManager).notifyNamenodeReceivedBlock(namespaceId, block, delHint);
    }

    @Override
    protected void notifyNamenodeDeletedBlock(int namespaceId, Block block) throws IOException {
        if (block == null) {
            throw new IllegalArgumentException("Block is null");
        }
        ((AvatarNamespaceManager) namespaceManager).notifyNamenodeDeletedBlock(namespaceId, block);
    }

    /** TODO: will add more details to this later on
     * Manages OfferService objects for the data node namespaces.
     * Each namespace has two OfferServices, one for pirmary and one for standby.
     * Creation, removal, starting, stopping, shutdown on OfferService
     * objects must be done via APIs in this class.
     */
    class AvatarNamespaceManager extends NamespaceManager {
        private final Object refreshNamenodesLock = new Object();

        AvatarNamespaceManager(List<InetSocketAddress> nameAddrs0, List<InetSocketAddress> nameAddrs1,
                List<InetSocketAddress> avatarAddrs0, List<InetSocketAddress> avatarAddrs1,
                List<String> defaultAddrs, Collection<String> nameserviceIds) throws IOException {
            Iterator<String> it = nameserviceIds.iterator();
            for (int i = 0; i < nameAddrs0.size(); i++) {
                InetSocketAddress nameAddr0 = nameAddrs0.get(i);
                String nameserviceId = it.hasNext() ? it.next() : null;
                nameNodeThreads.put(nameAddr0, new ServicePair(nameAddr0, nameAddrs1.get(i), avatarAddrs0.get(i),
                        avatarAddrs1.get(i), defaultAddrs.get(i), nameserviceId));
            }

        }

        /**
         * Notify both namenode(s) that we have received a block
         */
        protected void notifyNamenodeReceivedBlock(int namespaceId, Block block, String delHint)
                throws IOException {
            NamespaceService servicePair = get(namespaceId);
            if (servicePair == null) {
                throw new IOException("Cannot locate OfferService thread for namespace=" + namespaceId);
            }
            servicePair.notifyNamenodeReceivedBlock(block, delHint);
        }

        /**
         * Notify both namenode(s) that we have deleted a block
         */
        protected void notifyNamenodeDeletedBlock(int namespaceId, Block block) throws IOException {
            NamespaceService servicePair = this.get(namespaceId);
            if (servicePair == null) {
                throw new IOException("Cannot locate OfferService thread for namespace=" + namespaceId);
            }
            servicePair.notifyNamenodeDeletedBlock(block);
        }

        void refreshNamenodes(List<InetSocketAddress> nameAddrs0, List<InetSocketAddress> nameAddrs1,
                List<InetSocketAddress> avatarAddrs0, List<InetSocketAddress> avatarAddrs1,
                List<String> defaultAddrs, Collection<String> nameserviceIds)
                throws IOException, InterruptedException {
            List<Integer> toStart = new ArrayList<Integer>();
            List<String> toStartNameserviceIds = new ArrayList<String>();
            List<NamespaceService> toStop = new ArrayList<NamespaceService>();
            List<InetSocketAddress> toStopNNs = new ArrayList<InetSocketAddress>();
            synchronized (refreshNamenodesLock) {
                synchronized (this) {
                    for (InetSocketAddress nnAddr : nameNodeThreads.keySet()) {
                        if (!nameAddrs0.contains(nnAddr)) {
                            LOG.info("To remove service at " + nnAddr);
                            toStopNNs.add(nnAddr);
                        }
                    }
                    for (InetSocketAddress nnAddr : toStopNNs) {
                        NamespaceService ns = remove(nnAddr);
                        if (ns != null) {
                            LOG.info("Removing service: " + nnAddr);
                            toStop.add(ns);
                        }
                    }
                    Iterator<String> it = nameserviceIds.iterator();
                    for (int i = 0; i < nameAddrs0.size(); i++) {
                        String nameserviceId = it.hasNext() ? it.next() : null;
                        InetSocketAddress nnAddr = nameAddrs0.get(i);
                        if (!nameNodeThreads.containsKey(nnAddr)) {
                            LOG.info("Adding service " + nameserviceId + " at " + nnAddr);
                            toStart.add(i);
                            toStartNameserviceIds.add(nameserviceId);
                        }
                    }
                    it = toStartNameserviceIds.iterator();
                    for (Integer i : toStart) {
                        InetSocketAddress nameAddr0 = nameAddrs0.get(i);
                        nameNodeThreads.put(nameAddr0, new ServicePair(nameAddr0, nameAddrs1.get(i),
                                avatarAddrs0.get(i), avatarAddrs1.get(i), defaultAddrs.get(i), it.next()));
                    }
                }
                for (NamespaceService nsos : toStop) {
                    nsos.stop();
                }
                for (NamespaceService nsos : toStop) {
                    nsos.join();
                }
                startAll();
            }
        }

        /**
         * Refreshes the corresponding offer service if there were any
         * changes for that avatarnode in the config file.
         * 
         * @param zeroOrOne
         *          whether to refresh for AvatarZero or AvatarOne
         */
        void refreshOfferService(InetSocketAddress nameAddrs0, InetSocketAddress nameAddrs1,
                InetSocketAddress avatarAddrs0, InetSocketAddress avatarAddrs1, String serviceName)
                throws IOException {
            LOG.info("OfferService refresh called.");
            synchronized (refreshNamenodesLock) {
                synchronized (this) {
                    Collection<NamespaceService> allServices = nameNodeThreads.values();
                    NamespaceService theServicePairToUpdate = null;
                    for (NamespaceService aService : allServices) {
                        if (aService.getNameserviceId().equalsIgnoreCase(serviceName)) {
                            theServicePairToUpdate = aService;
                            break;
                        }
                    }
                    if (theServicePairToUpdate == null) {
                        throw new IOException("Invalid service name.");
                    }
                    boolean wasZeroRefreshed = false;
                    ServicePair toBeRefreshed = (ServicePair) theServicePairToUpdate;
                    if (!(nameAddrs0.equals(toBeRefreshed.nameAddr1)
                            && avatarAddrs0.equals(toBeRefreshed.avatarAddr1))) {
                        LOG.info("Refreshing offer service to node zero for service: " + serviceName);
                        logChangeOf(toBeRefreshed.nameAddr1, nameAddrs0);
                        logChangeOf(toBeRefreshed.avatarAddr1, avatarAddrs0);
                        toBeRefreshed.restartServiceZeroWith(nameAddrs0, avatarAddrs0);
                        remapNameservice(toBeRefreshed.nameAddr1, nameAddrs0);
                        wasZeroRefreshed = true;
                    }
                    boolean wasOneRefreshed = false;
                    if (!(nameAddrs1.equals(toBeRefreshed.nameAddr2)
                            && avatarAddrs1.equals(toBeRefreshed.avatarAddr2))) {
                        LOG.info("Refreshing offer service to node zero for service: " + serviceName);
                        logChangeOf(toBeRefreshed.nameAddr2, nameAddrs1);
                        logChangeOf(toBeRefreshed.avatarAddr2, avatarAddrs1);
                        toBeRefreshed.restartServiceOneWith(nameAddrs1, avatarAddrs1);
                        wasOneRefreshed = true;
                    }
                    if (!wasZeroRefreshed && !wasOneRefreshed) {
                        LOG.warn("Neither of the offerservices were refreshed");
                    }
                }
            }
        }
    }

    private static void logChangeOf(InetSocketAddress prev, InetSocketAddress next) {
        if (prev.equals(next)) {
            return;
        }
        LOG.info("From: <" + prev + "> To: <" + next + ">");
    }

    public class ServicePair extends NamespaceService {
        String defaultAddr;

        InetSocketAddress nameAddr1;
        InetSocketAddress nameAddr2;
        DatanodeProtocol namenode1;
        DatanodeProtocol namenode2;
        AvatarProtocol avatarnode1;
        AvatarProtocol avatarnode2;
        InetSocketAddress avatarAddr1;
        InetSocketAddress avatarAddr2;

        boolean doneRegister1 = false; // not yet registered with namenode1
        boolean doneRegister2 = false; // not yet registered with namenode2

        OfferService offerService1;
        OfferService offerService2;

        volatile OfferService primaryOfferService = null;
        volatile InetSocketAddress primaryAddr = null;
        Thread of1;
        Thread of2;

        // the registration layout version is matching
        volatile boolean currentRegistrationLayoutMatch1 = true;
        volatile boolean currentRegistrationLayoutMatch2 = true;

        int namespaceId;
        String nameserviceId;
        Thread spThread;
        AvatarZooKeeperClient zkClient;

        private NamespaceInfo nsInfo;
        DatanodeRegistration nsRegistration;
        private UpgradeManagerDatanode upgradeManager;
        private volatile boolean initialized = false;
        private volatile boolean shouldServiceRun = true;
        volatile long lastBeingAlive = now();

        private ServicePair(InetSocketAddress nameAddr1, InetSocketAddress nameAddr2, InetSocketAddress avatarAddr1,
                InetSocketAddress avatarAddr2, String defaultAddr, String nameserviceId) {
            this.nameAddr1 = nameAddr1;
            this.nameAddr2 = nameAddr2;
            this.avatarAddr1 = avatarAddr1;
            this.avatarAddr2 = avatarAddr2;
            this.defaultAddr = defaultAddr;
            this.nameserviceId = nameserviceId;
            zkClient = new AvatarZooKeeperClient(getConf(), null);
            this.nsRegistration = new DatanodeRegistration(getMachineName());
        }

        private void setNamespaceInfo(NamespaceInfo nsinfo) {
            this.nsInfo = nsinfo;
            this.namespaceId = nsinfo.getNamespaceID();
            namespaceManager.addNamespace(this);
        }

        private void setupNS() throws IOException {
            // handshake with NN
            NamespaceInfo nsInfo;
            nsInfo = handshake(true);
            setNamespaceInfo(nsInfo);
            synchronized (AvatarDataNode.this) {
                setupNSStorage();
            }

            nsRegistration.setIpcPort(ipcServer.getListenerAddress().getPort());
            nsRegistration.setInfoPort(infoServer.getPort());
        }

        private void setupNSStorage() throws IOException {
            Configuration conf = getConf();
            StartupOption startOpt = getStartupOption(conf);
            assert startOpt != null : "Startup option must be set.";

            boolean simulatedFSDataset = conf.getBoolean("dfs.datanode.simulateddatastorage", false);

            if (simulatedFSDataset) {
                nsRegistration.setStorageID(storage.getStorageID()); //same as DN
                nsRegistration.storageInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
                nsRegistration.storageInfo.namespaceID = nsInfo.namespaceID;
            } else {
                // read storage info, lock data dirs and transition fs state if necessary      
                // first do it at the top level dataDirs
                // This is done only once when among all namespaces
                storage.recoverTransitionRead(AvatarDataNode.this, nsInfo, dataDirs, startOpt);
                // Then do it for this namespace's directory
                storage.recoverTransitionRead(AvatarDataNode.this, nsInfo.namespaceID, nsInfo, dataDirs, startOpt,
                        nameserviceId);

                LOG.info("setting up storage: namespaceId=" + namespaceId + ";lv=" + storage.layoutVersion
                        + ";nsInfo=" + nsInfo);

                nsRegistration.setStorageInfo(storage.getNStorage(nsInfo.namespaceID), storage.getStorageID());
                data.initialize(storage);

            }
            data.addNamespace(namespaceId, storage.getNameSpaceDataDir(namespaceId), conf);
            if (blockScanner != null) {
                blockScanner.start();
                blockScanner.addNamespace(namespaceId);
            }
        }

        @Override
        public UpgradeManagerDatanode getUpgradeManager() {
            synchronized (AvatarDataNode.this) {
                if (upgradeManager == null)
                    upgradeManager = new UpgradeManagerDatanode(AvatarDataNode.this, namespaceId);
            }
            return upgradeManager;
        }

        public void processUpgradeCommand(UpgradeCommand comm) throws IOException {
            assert upgradeManager != null : "DataNode.upgradeManager is null.";
            upgradeManager.processUpgradeCommand(comm);
        }

        /**
         * Start distributed upgrade if it should be initiated by the data-node.
         */
        private void startDistributedUpgradeIfNeeded() throws IOException {
            UpgradeManagerDatanode um = getUpgradeManager();

            if (!um.getUpgradeState())
                return;
            um.setUpgradeState(false, um.getUpgradeVersion());
            um.startUpgrade();
            return;
        }

        public void start() {
            if ((spThread != null) && (spThread.isAlive())) {
                //Thread is started already
                return;
            }
            LOG.info("start service " + this.nameserviceId);
            spThread = new Thread(this, dnThreadName + " for namespace " + namespaceId);
            spThread.setDaemon(true);
            spThread.start();

        }

        public void stop() {
            stopServices();
            if (spThread != null) {
                spThread.interrupt();
            }
        }

        private void initProxy1() throws IOException {
            synchronized (avatarAddr1) {
                if (namenode1 == null) {
                    namenode1 = (DatanodeProtocol) RPC.getProxy(DatanodeProtocol.class, DatanodeProtocol.versionID,
                            nameAddr1, getConf());
                }

                if (avatarnode1 == null) {
                    avatarnode1 = (AvatarProtocol) RPC.getProxy(AvatarProtocol.class, AvatarProtocol.versionID,
                            avatarAddr1, getConf());
                }
            }
        }

        private void initProxy2() throws IOException {
            InjectionHandler.processEventIO(InjectionEvent.OFFERSERVICE_BEFORE_INIT_PROXY2, nameAddr2, avatarAddr2);

            synchronized (avatarAddr2) {
                if (namenode2 == null) {
                    namenode2 = (DatanodeProtocol) RPC.getProxy(DatanodeProtocol.class, DatanodeProtocol.versionID,
                            nameAddr2, getConf());
                }
                if (avatarnode2 == null) {
                    avatarnode2 = (AvatarProtocol) RPC.getProxy(AvatarProtocol.class, AvatarProtocol.versionID,
                            avatarAddr2, getConf());
                }
            }
        }

        private void restartServiceZeroWith(InetSocketAddress namenodeZero, InetSocketAddress avatarZero)
                throws IOException {
            synchronized (avatarAddr1) {
                nameAddr1 = namenodeZero;
                avatarAddr1 = avatarZero;
            }
            restartService1();
        }

        private void restartServiceOneWith(InetSocketAddress namenodeOne, InetSocketAddress avatarOne)
                throws IOException {
            synchronized (avatarAddr2) {
                nameAddr2 = namenodeOne;
                avatarAddr2 = avatarOne;
            }
            restartService2();
        }

        public void restartService1() throws IOException {
            // Rely on handshake to restart the service.
            synchronized (avatarAddr1) {
                stopService1();
                joinService1();
                doneRegister1 = false;
            }
        }

        void stopService1() {
            RPC.stopProxy(avatarnode1);
            RPC.stopProxy(namenode1);
            avatarnode1 = null;
            namenode1 = null;
            if (offerService1 != null) {
                offerService1.stop();
            }
            if (of1 != null) {
                of1.interrupt();
            }
        }

        void stopService2() {
            RPC.stopProxy(avatarnode2);
            RPC.stopProxy(namenode2);
            avatarnode2 = null;
            namenode2 = null;
            if (offerService2 != null) {
                offerService2.stop();
            }
            if (of2 != null) {
                of2.interrupt();
            }
        }

        private void joinService1() {
            if (of1 != null) {
                try {
                    of1.join();
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                }
            }
        }

        private void joinService2() {
            if (of2 != null) {
                try {
                    of2.join();
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                }
            }
        }

        public void restartService2() throws IOException {
            // Rely on handshake to restart the service.
            synchronized (avatarAddr2) {
                stopService2();
                joinService2();
                doneRegister2 = false;
            }
        }

        /** stop two offer services */
        private void stopServices() {
            this.shouldServiceRun = false;
            LOG.info("stop services " + this.nameserviceId);
            stopService1();
            stopService2();
            if (zkClient != null) {
                try {
                    zkClient.shutdown();
                } catch (InterruptedException ie) {
                    LOG.warn("Zk shutdown is interrupted: ", ie);
                }
            }
        }

        public void join() {
            joinServices();
            if (spThread != null) {
                try {
                    spThread.join();
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                }
                spThread = null;
            }
        }

        /** Join two offer services */
        private void joinServices() {
            joinService1();
            joinService2();
        }

        public void cleanUp() {
            if (upgradeManager != null)
                upgradeManager.shutdownUpgrade();

            namespaceManager.remove(this.getNNSocketAddress());
            shouldServiceRun = false;
            try {
                RPC.stopProxy(namenode1);
            } catch (Exception e) {
                LOG.warn("Exception stop the namenode RPC threads", e);
            }
            try {
                RPC.stopProxy(namenode2);
            } catch (Exception e) {
                LOG.warn("Exception stop the namenode RPC threads", e);
            }
            if (blockScanner != null) {
                blockScanner.removeNamespace(this.getNamespaceId());
            }
            if (data != null) {
                data.removeNamespace(this.getNamespaceId());
            }
            if (storage != null) {
                storage.removeNamespaceStorage(this.getNamespaceId());
            }
        }

        public void shutdown() {
            stop();
            join();
        }

        // connect to both name node if possible. 
        // If doWait is true, then return only when at least one handshake is
        // successful.
        //
        private NamespaceInfo handshake(boolean startup) throws IOException {
            NamespaceInfo nsInfo = null;
            boolean firstIsPrimary = false;
            // When true indicates ZK is null and there is no primary. This is to
            // enable datanode startups during failover. The assumption is that the
            // layout version of the Standby and Primary would be consistent when
            // we failover and hence we can speak to any one of the nodes to find out
            // the NamespaceInfo.
            boolean noPrimary = false;

            do {
                if (startup) {
                    // The startup option is used when the datanode is first created
                    // We only need to connect to the primary at this point and as soon
                    // as possible. So figure out who the primary is from the ZK
                    try {
                        getPrimaryAddr();
                        noPrimary = (this.primaryAddr == null);
                        firstIsPrimary = nameAddr1.equals(primaryAddr);
                    } catch (Exception ex) {
                        LOG.error("Could not get the primary address from ZooKeeper", ex);
                    }
                }
                try {
                    if ((firstIsPrimary && startup) || !startup || noPrimary) {
                        // only try to connect to the first NN if it is not the
                        // startup connection or if it is primary on startup
                        // This way if it is standby we are not wasting datanode startup
                        // time
                        initProxy1();
                        if (startup) {
                            nsInfo = handshake(namenode1, nameAddr1);
                        }
                    }
                } catch (ConnectException se) { // namenode has not been started
                    LOG.info("Server at " + nameAddr1 + " not available yet, Zzzzz...");
                } catch (NoRouteToHostException nrhe) {
                    LOG.info("NoRouteToHostException connecting to server. " + nameAddr1, nrhe);
                } catch (PortUnreachableException pue) {
                    LOG.info("PortUnreachableException connecting to server. " + nameAddr1, pue);
                } catch (UnknownHostException uhe) {
                    LOG.info("UnknownHostException connecting to server. " + nameAddr1, uhe);
                } catch (SocketTimeoutException te) { // namenode is busy
                    LOG.info("Problem connecting to server timeout. " + nameAddr1);
                } catch (IOException ioe) {
                    LOG.info("Problem connecting to server. " + nameAddr1, ioe);
                }
                try {
                    if ((!firstIsPrimary && startup) || !startup || noPrimary) {
                        initProxy2();
                        if (startup) {
                            NamespaceInfo tempInfo = handshake(namenode2, nameAddr2);
                            // During failover both layouts should match.
                            if (noPrimary && nsInfo != null
                                    && tempInfo.getLayoutVersion() != nsInfo.getLayoutVersion()) {
                                throw new IOException("Layout versions don't match on zero, one: "
                                        + nsInfo.getLayoutVersion() + ", " + tempInfo.getLayoutVersion());
                            }
                            nsInfo = tempInfo;
                        }
                    }
                } catch (ConnectException se) { // namenode has not been started
                    LOG.info("Server at " + nameAddr2 + " not available yet, Zzzzz...");
                } catch (NoRouteToHostException nrhe) {
                    LOG.info("NoRouteToHostException connecting to server. " + nameAddr2, nrhe);
                } catch (PortUnreachableException pue) {
                    LOG.info("PortUnreachableException connecting to server. " + nameAddr2, pue);
                } catch (UnknownHostException uhe) {
                    LOG.info("UnknownHostException connecting to server. " + nameAddr2, uhe);
                } catch (SocketTimeoutException te) { // namenode is busy
                    LOG.info("Problem connecting to server timeout. " + nameAddr2);
                } catch (RemoteException re) {
                    handleRegistrationError(re, nameAddr2);
                } catch (IOException ioe) {
                    LOG.info("Problem connecting to server. " + nameAddr2, ioe);
                }
            } while (startup && nsInfo == null && shouldServiceRun);
            return nsInfo;
        }

        private NamespaceInfo handshake(DatanodeProtocol node, InetSocketAddress machine) throws IOException {
            NamespaceInfo nsInfo = null;
            while (shouldServiceRun) {
                try {
                    nsInfo = node.versionRequest();
                    break;
                } catch (SocketTimeoutException e) { // namenode is busy
                    LOG.info("Problem connecting to server: " + machine);
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException ie) {
                    }
                }
            }
            if (nsInfo == null) { // because shouldServiceRun becomes false
                return null;
            }
            LOG.info("Handshake with namenode server: " + machine);
            String errorMsg = null;
            // do not fail on incompatible build version
            if (!nsInfo.getBuildVersion().equals(Storage.getBuildVersion())) {
                errorMsg = "Incompatible build versions: namenode BV = " + nsInfo.getBuildVersion()
                        + "; datanode BV = " + Storage.getBuildVersion();
                LOG.warn(errorMsg);
            }
            if (FSConstants.LAYOUT_VERSION < nsInfo.getLayoutVersion()) {
                // datanode has a newer layout version - allowed
                LOG.warn("Datanode has newer layout versions than namenode: namenode LV = "
                        + nsInfo.getLayoutVersion() + "; datanode BV = " + FSConstants.LAYOUT_VERSION
                        + " Will continue assuming data node version.");

                nsInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
                // to indicate that the upgrade should not be finalized
                // until we register to namenode with matching LV
                setRegistrationMatch(machine, false);
            } else if (FSConstants.LAYOUT_VERSION > nsInfo.getLayoutVersion()) {
                // namenode has newer layout version - disallowed
                errorMsg = "Datanode has older layout versions than namenode: namenode LV = "
                        + nsInfo.getLayoutVersion() + "; datanode BV = " + FSConstants.LAYOUT_VERSION
                        + " Datanode will shut down. namenode server: " + machine;
                LOG.fatal(errorMsg);
                try {
                    node.errorReport(nsRegistration, DatanodeProtocol.NOTIFY, errorMsg);
                } catch (SocketTimeoutException e) { // namenode is busy        
                    LOG.info("Problem connecting to server: " + machine);
                }
                shutdownDN();
                throw new IOException(errorMsg);
            } else {
                // versions are matching
                // so we can process finalize upgrade commands
                // offer service will discard standby DNA_FINALIZE on its own
                setRegistrationMatch(machine, true);
            }
            return nsInfo;
        }

        /**
         * Returns true if we are able to successfully register with namenode
         */
        boolean register(DatanodeProtocol node, InetSocketAddress machine, boolean dnaRegister) throws IOException {
            if (nsRegistration.getStorageID().equals("")) {
                setNewStorageID(nsRegistration);
            }

            DatanodeRegistration tmp = new DatanodeRegistration(nsRegistration.getName());
            tmp.setInfoPort(nsRegistration.getInfoPort());
            tmp.setIpcPort(nsRegistration.getIpcPort());
            boolean simulatedFSDataset = conf.getBoolean("dfs.datanode.simulateddatastorage", false);
            if (simulatedFSDataset) {
                tmp.setStorageID(storage.getStorageID()); //same as DN
                tmp.storageInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
                tmp.storageInfo.namespaceID = nsInfo.namespaceID;
            } else {
                tmp.setStorageInfo(storage.getNStorage(namespaceId), storage.getStorageID());
            }

            // reset name to machineName. Mainly for web interface.
            tmp.name = machineName + ":" + nsRegistration.getPort();
            try {
                tmp = node.register(tmp, DataTransferProtocol.DATA_TRANSFER_VERSION);
                // if we successded registering for the first time, then we update
                // the global registration objct
                if (!doneRegister1 && !doneRegister2) {
                    nsRegistration = tmp;
                }
            } catch (SocketTimeoutException e) { // namenode is busy
                LOG.info("Problem connecting to server: " + machine);
                return false;
            }

            assert ("".equals(storage.getStorageID()) && !"".equals(nsRegistration.getStorageID()))
                    || storage.getStorageID().equals(nsRegistration
                            .getStorageID()) : "New storageID can be assigned only if data-node is not formatted";
            if (storage.getStorageID().equals("")) {
                storage.setStorageID(nsRegistration.getStorageID());
                storage.writeAll();
                LOG.info("New storage id " + nsRegistration.getStorageID() + " is assigned to data-node "
                        + nsRegistration.getName());
            }
            if (!storage.getStorageID().equals(nsRegistration.getStorageID())) {
                throw new IOException("Inconsistent storage IDs. Name-node returned "
                        + nsRegistration.getStorageID() + ". Expecting " + storage.getStorageID());
            }

            // offerservice got DNA_REGISTER, so we might be talking to upgraded namenode
            // do the handshake again
            if (!getRegistrationMatch(machine) && dnaRegister) {
                handshake(node, machine);
            }

            sendBlocksBeingWrittenReport(node, namespaceId, nsRegistration);
            return true;
        }

        boolean isPrimaryOfferService(OfferService service) {
            return primaryOfferService == service;
        }

        /**
         * Return true if the last registration for the given offer service
         * had matching layout version, false otherwise.
         */
        boolean shouldProcessFinalizeCommand(OfferService service) throws IOException {
            if (service == offerService1) {
                return currentRegistrationLayoutMatch1;
            }
            if (service == offerService2) {
                return currentRegistrationLayoutMatch2;
            } else {
                throw new IOException("Offer service not known!");
            }
        }

        /**
         * Sets current registration matching layout for the given
         * namenode address.
         */
        void setRegistrationMatch(InetSocketAddress nameNodeAddr, boolean value) throws IOException {
            if (nameAddr1.equals(nameNodeAddr)) {
                currentRegistrationLayoutMatch1 = value;
            } else if (nameAddr2.equals(nameNodeAddr)) {
                currentRegistrationLayoutMatch2 = value;
            } else {
                throw new IOException("Machine : " + nameNodeAddr + " is not configured as namenode");
            }
        }

        boolean getRegistrationMatch(InetSocketAddress nameNodeAddr) {
            if (nameAddr1.equals(nameNodeAddr)) {
                return currentRegistrationLayoutMatch1;
            } else {
                return currentRegistrationLayoutMatch2;
            }
        }

        void setPrimaryOfferService(OfferService service) {
            this.primaryOfferService = service;
            if (service != null)
                LOG.info("Primary namenode is set to be " + service.avatarnodeAddress);
            else {
                LOG.info("Failover has happened. Stop accessing commands from "
                        + "either namenode until the new primary is completely in" + "sync with all the datanodes");
            }
        }

        boolean isPrimary(InetSocketAddress namenodeAddress) throws InterruptedException {
            getPrimaryAddr();
            if (this.primaryAddr == null) {
                return false;
            }
            return this.primaryAddr.equals(namenodeAddress);
        }

        private void getPrimaryAddr() throws InterruptedException {
            try {
                Stat stat = new Stat();
                this.primaryAddr = NetUtils
                        .createSocketAddr(this.zkClient.getPrimaryAvatarAddress(this.defaultAddr, stat, false));
            } catch (InterruptedException ie) {
                throw ie;
            } catch (Exception ex) {
                LOG.error("Could not get the primary from ZooKeeper", ex);
                this.primaryAddr = null;
            }
        }

        void handleRegistrationError(RemoteException re, InetSocketAddress failedNode) {
            // If either the primary or standby NN throws these exceptions, this
            // datanode will exit. I think this is the right behaviour because
            // the excludes list on both namenode better be the same.
            String reClass = re.getClassName();
            if (failedNode.equals(primaryAddr) && (UnregisteredDatanodeException.class.getName().equals(reClass)
                    || DisallowedDatanodeException.class.getName().equals(reClass)
                    || IncorrectVersionException.class.getName().equals(reClass))) {
                LOG.warn("Shut down this service: ", re);
                this.shouldServiceRun = false;
            } else {
                LOG.warn(re);
            }
        }

        private void register1() throws IOException {
            synchronized (avatarAddr1) {
                InjectionHandler.processEventIO(InjectionEvent.AVATARDATANODE_BEFORE_START_OFFERSERVICE1);
                if (avatarnode1 != null && namenode1 != null && !doneRegister1
                        && register(namenode1, nameAddr1, false)) {
                    InjectionHandler.processEvent(InjectionEvent.AVATARDATANODE_START_OFFERSERVICE1);
                    doneRegister1 = true;
                    offerService1 = new OfferService(AvatarDataNode.this, this, namenode1, nameAddr1, avatarnode1,
                            avatarAddr1);
                    of1 = new Thread(offerService1, "OfferService1 " + nameAddr1);
                    of1.start();
                }
            }
        }

        private void register2() throws IOException {
            synchronized (avatarAddr2) {
                InjectionHandler.processEventIO(InjectionEvent.AVATARDATANODE_BEFORE_START_OFFERSERVICE2);
                if (avatarnode2 != null && namenode2 != null && !doneRegister2
                        && register(namenode2, nameAddr2, false)) {
                    InjectionHandler.processEvent(InjectionEvent.AVATARDATANODE_START_OFFERSERVICE2);
                    doneRegister2 = true;
                    offerService2 = new OfferService(AvatarDataNode.this, this, namenode2, nameAddr2, avatarnode2,
                            avatarAddr2);
                    of2 = new Thread(offerService2, "OfferService2 " + nameAddr2);
                    of2.start();
                }
            }
        }

        @Override
        public void run() {
            LOG.info(nsRegistration + "In AvatarDataNode.run, data = " + data);

            try {
                // set up namespace
                try {
                    setupNS();
                } catch (IOException ioe) {
                    // Initial handshake, storage recovery or registration failed
                    LOG.fatal(nsRegistration + " initialization failed for namespaceId " + namespaceId
                            + " default addr: " + defaultAddr, ioe);
                    return;
                }

                while (shouldServiceRun && shouldRun) {
                    InetSocketAddress failedNode = null;
                    try {
                        // try handshaking with any namenode that we have not yet tried
                        handshake(false);

                        try {
                            failedNode = nameAddr1;
                            register1();
                            failedNode = nameAddr2;
                        } finally {
                            register2();
                        }

                        this.initialized = true;
                        startDistributedUpgradeIfNeeded();
                    } catch (RemoteException re) {
                        handleRegistrationError(re, failedNode);
                    } catch (IOException ioe) { //other io exception
                        LOG.warn("IOException: ", ioe);
                    }
                    if (shouldServiceRun && shouldRun) {
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException ie) {
                        }
                    }
                }
            } catch (Exception e) {
                LOG.warn("Exception: ", e);
            } finally {
                LOG.info(nsRegistration + ":Finishing AvatarDataNode in: " + data);
                stopServices();
                joinServices();
                cleanUp();
            }
        }

        /**
         * Notify both namenode(s) that we have received a block
         */
        @Override
        public void notifyNamenodeReceivedBlock(Block block, String delHint) {
            if (offerService1 != null) {
                offerService1.notifyNamenodeReceivedBlock(block, delHint);
            }
            if (offerService2 != null) {
                offerService2.notifyNamenodeReceivedBlock(block, delHint);
            }
        }

        /**
         * Notify both namenode(s) that we have deleted a block
         */
        @Override
        public void notifyNamenodeDeletedBlock(Block block) {
            if (offerService1 != null) {
                offerService1.notifyNamenodeDeletedBlock(block);
            }
            if (offerService2 != null) {
                offerService2.notifyNamenodeDeletedBlock(block);
            }
        }

        /**
         * Update received and retry list, when blocks are deleted
         */
        void removeReceivedBlocks(Block[] list) {
            if (offerService1 != null) {
                offerService1.removeReceivedBlocks(list);
            }
            if (offerService2 != null) {
                offerService2.removeReceivedBlocks(list);
            }
        }

        @Override
        public DatanodeRegistration getNsRegistration() {
            return nsRegistration;
        }

        @Override
        public DatanodeProtocol getDatanodeProtocol() {
            return this.primaryOfferService.namenode;
        }

        @Override
        public InetSocketAddress getNNSocketAddress() {
            return this.nameAddr1;
        }

        @Override
        public int getNamespaceId() {
            return this.namespaceId;
        }

        @Override
        public String getNameserviceId() {
            return this.nameserviceId;
        }

        @Override
        public boolean initialized() {
            return initialized;
        }

        @Override
        public boolean isAlive() {
            return shouldServiceRun && spThread.isAlive();
        }

        @Override
        public void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
            if (this.offerService1 != null)
                this.offerService1.reportBadBlocks(blocks);
            if (this.offerService2 != null)
                this.offerService2.reportBadBlocks(blocks);
        }

        @Override
        public LocatedBlock syncBlock(Block block, List<BlockRecord> syncList, boolean closeFile,
                List<InterDatanodeProtocol> datanodeProxies, long deadline) throws IOException {
            if (offerService1 != null && isPrimaryOfferService(offerService1))
                return offerService1.syncBlock(block, syncList, closeFile, datanodeProxies, deadline);
            if (offerService2 != null && isPrimaryOfferService(offerService2))
                return offerService2.syncBlock(block, syncList, closeFile, datanodeProxies, deadline);
            return null;
        }

        @Override
        public void scheduleBlockReport(long delay) {
            if (this.offerService1 != null)
                this.offerService1.scheduleBlockReport(delay);
            if (this.offerService2 != null)
                this.offerService2.scheduleBlockReport(delay);
        }

        // Only use for testing
        public void scheduleBlockReceivedAndDeleted(long delay) {
            if (this.offerService1 != null)
                this.offerService1.scheduleBlockReceivedAndDeleted(delay);
            if (this.offerService2 != null)
                this.offerService2.scheduleBlockReceivedAndDeleted(delay);
        }

    }

    /**
     * Tells the datanode to start the shutdown process.
     */
    public synchronized void shutdownDN() {
        shouldRun = false;
        if (namespaceManager != null) {
            namespaceManager.stopAll();
        }
    }

    public boolean shouldRun() {
        return shouldRun;
    }

    DataStorage getStorage() {
        return storage;
    }

    private static void printUsage() {
        System.err.println("Usage: java DataNode");
        System.err.println("           [-rollback]");
    }

    /**
     * Parse and verify command line arguments and set configuration parameters.
     *
     * @return false if passed argements are incorrect
     */
    private static boolean parseArguments(String args[], Configuration conf) {
        int argsLen = (args == null) ? 0 : args.length;
        StartupOption startOpt = StartupOption.REGULAR;
        for (int i = 0; i < argsLen; i++) {
            String cmd = args[i];
            if ("-r".equalsIgnoreCase(cmd) || "--rack".equalsIgnoreCase(cmd)) {
                LOG.error("-r, --rack arguments are not supported anymore. RackID "
                        + "resolution is handled by the NameNode.");
                System.exit(-1);
            } else if ("-rollback".equalsIgnoreCase(cmd)) {
                startOpt = StartupOption.ROLLBACK;
            } else if ("-regular".equalsIgnoreCase(cmd)) {
                startOpt = StartupOption.REGULAR;
            } else
                return false;
        }
        setStartupOption(conf, startOpt);
        return true;
    }

    private static void setStartupOption(Configuration conf, StartupOption opt) {
        conf.set("dfs.datanode.startup", opt.toString());
    }

    /**
     * Returns the IP address of the namenode
     */
    static InetSocketAddress getNameNodeAddress(Configuration conf, String cname, String rpcKey, String cname2) {
        String fs = conf.get(cname);
        String fs1 = conf.get(rpcKey);
        String fs2 = conf.get(cname2);
        Configuration newconf = new Configuration(conf);
        newconf.set("fs.default.name", fs);
        if (fs1 != null) {
            newconf.set(DFS_NAMENODE_RPC_ADDRESS_KEY, fs1);
        }
        if (fs2 != null) {
            newconf.set("dfs.namenode.dn-address", fs2);
        }
        return DataNode.getNameNodeAddress(newconf);
    }

    /**
     * Returns the IP:port address of the avatar node
     */
    private static List<InetSocketAddress> getAvatarNodeAddresses(String suffix, Configuration conf,
            Collection<String> serviceIds) throws IOException {
        List<InetSocketAddress> namenodeAddresses = DFSUtil.getRPCAddresses(suffix, conf, serviceIds,
                FSConstants.DFS_NAMENODE_RPC_ADDRESS_KEY);
        List<InetSocketAddress> avatarnodeAddresses = new ArrayList<InetSocketAddress>(namenodeAddresses.size());
        for (InetSocketAddress namenodeAddress : namenodeAddresses) {
            avatarnodeAddresses.add(new InetSocketAddress(namenodeAddress.getAddress(),
                    conf.getInt("dfs.avatarnode.port", namenodeAddress.getPort() + 1)));
        }
        return avatarnodeAddresses;
    }

    public static AvatarDataNode makeInstance(String[] dataDirs, Configuration conf) throws IOException {
        ArrayList<File> dirs = new ArrayList<File>();
        for (int i = 0; i < dataDirs.length; i++) {
            File data = new File(dataDirs[i]);
            try {
                DiskChecker.checkDir(data);
                dirs.add(data);
            } catch (DiskErrorException e) {
                LOG.warn("Invalid directory in dfs.data.dir: " + e.getMessage());
            }
        }
        if (dirs.size() > 0) {
            String dnThreadName = "AvatarDataNode: [" + StringUtils.arrayToString(dataDirs) + "]";
            return new AvatarDataNode(conf, dirs, dnThreadName);
        }
        LOG.error("All directories in dfs.data.dir are invalid.");
        return null;
    }

    /** Instantiate a single datanode object. This must be run by invoking
     *  {@link DataNode#runDatanodeDaemon(DataNode)} subsequently. 
     */
    public static AvatarDataNode instantiateDataNode(String args[], Configuration conf) throws IOException {
        if (conf == null)
            conf = new Configuration();
        if (!parseArguments(args, conf)) {
            printUsage();
            return null;
        }
        if (conf.get("dfs.network.script") != null) {
            LOG.error("This configuration for rack identification is not supported"
                    + " anymore. RackID resolution is handled by the NameNode.");
            System.exit(-1);
        }
        String[] dataDirs = getListOfDataDirs(conf);
        return makeInstance(dataDirs, conf);
    }

    public static AvatarDataNode createDataNode(String args[], Configuration conf) throws IOException {
        AvatarDataNode dn = instantiateDataNode(args, conf);
        dn.runDatanodeDaemon();
        return dn;
    }

    @Override
    public void refreshNamenodes(Configuration conf) throws IOException {
        LOG.info("refresh namenodes");
        try {
            Collection<String> serviceIds = DFSUtil.getNameServiceIds(conf);
            List<InetSocketAddress> nameAddrs0 = DFSUtil.getRPCAddresses("0", conf, serviceIds,
                    DATANODE_PROTOCOL_ADDRESS, DFS_NAMENODE_RPC_ADDRESS_KEY);
            List<InetSocketAddress> nameAddrs1 = DFSUtil.getRPCAddresses("1", conf, serviceIds,
                    DATANODE_PROTOCOL_ADDRESS, DFS_NAMENODE_RPC_ADDRESS_KEY);
            List<InetSocketAddress> avatarAddrs0 = getAvatarNodeAddresses("0", conf, serviceIds);
            List<InetSocketAddress> avatarAddrs1 = getAvatarNodeAddresses("1", conf, serviceIds);
            List<String> defaultAddresses = getZnodePaths(serviceIds, conf);
            ((AvatarNamespaceManager) namespaceManager).refreshNamenodes(nameAddrs0, nameAddrs1, avatarAddrs0,
                    avatarAddrs1, defaultAddresses, serviceIds);
        } catch (InterruptedException e) {
            throw new IOException(e.getCause());
        }
    }

    private static List<String> getZnodePaths(Collection<String> serviceIds, Configuration conf) {
        List<String> datanodeProtocolZnodePaths = new ArrayList<String>(Math.max(serviceIds.size(), 1));
        if (serviceIds.isEmpty()) {
            datanodeProtocolZnodePaths.add(conf.get(NameNode.DATANODE_PROTOCOL_ADDRESS));
        } else {
            for (String service : serviceIds) {
                datanodeProtocolZnodePaths.add(conf.get(NameNode.DATANODE_PROTOCOL_ADDRESS + "." + service));
            }
        }
        return datanodeProtocolZnodePaths;
    }

    @Override
    public void refreshOfferService(String serviceNameToRefresh) throws IOException {
        Configuration currentConf = new Configuration();
        Collection<String> serviceIds = DFSUtil.getNameServiceIds(currentConf);
        String[] allServices = serviceIds.toArray(new String[] {});
        for (int i = 0; i < allServices.length; i++) {
            if (((String) allServices[i]).equalsIgnoreCase(serviceNameToRefresh)) {
                List<InetSocketAddress> nameAddrs0 = DFSUtil.getRPCAddresses("0", currentConf, serviceIds,
                        DATANODE_PROTOCOL_ADDRESS, DFS_NAMENODE_RPC_ADDRESS_KEY);
                List<InetSocketAddress> nameAddrs1 = DFSUtil.getRPCAddresses("1", currentConf, serviceIds,
                        DATANODE_PROTOCOL_ADDRESS, DFS_NAMENODE_RPC_ADDRESS_KEY);
                List<InetSocketAddress> avatarAddrs0 = getAvatarNodeAddresses("0", currentConf, serviceIds);
                List<InetSocketAddress> avatarAddrs1 = getAvatarNodeAddresses("1", currentConf, serviceIds);
                ((AvatarNamespaceManager) namespaceManager).refreshOfferService(nameAddrs0.get(i),
                        nameAddrs1.get(i), avatarAddrs0.get(i), avatarAddrs1.get(i), serviceNameToRefresh);
                return;
            }
        }
        throw new IOException("Service name  (=" + serviceNameToRefresh + ") not found. ");
    }

    public static void main(String argv[]) {
        org.apache.hadoop.hdfs.DnsMonitorSecurityManager.setTheManager();
        try {
            StringUtils.startupShutdownMessage(AvatarDataNode.class, argv, LOG);
            FastWritableHDFS.init();
            FastProtocolHDFS.init();
            AvatarDataNode avatarnode = createDataNode(argv, null);
            if (avatarnode != null) {
                avatarnode.waitAndShutdown();
            }
        } catch (Throwable e) {
            LOG.error(StringUtils.stringifyException(e));
            System.exit(-1);
        }
    }
}