org.apache.hadoop.hbase.replication.ReplicationQueuesZKImpl.java Source code


Introduction

Here is the source code for org.apache.hadoop.hbase.replication.ReplicationQueuesZKImpl.java
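ReplicationQueuesZKImpl is the ZooKeeper-backed implementation of the ReplicationQueues interface: each region server records, under its own znode, the HLogs still to be replicated per peer and the latest replicated position in each one. Before the listing, here is a minimal, hypothetical usage sketch (it is not part of the file); the ZooKeeperWatcher, Configuration and Abortable instances, the server name, the queue id "1" and the HLog name are placeholders you would supply from your own region server context.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationQueues;
import org.apache.hadoop.hbase.replication.ReplicationQueuesZKImpl;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;

public class ReplicationQueuesUsageSketch {

    /** Hypothetical walk-through of one HLog's life in a replication queue. */
    static long trackOneLog(ZooKeeperWatcher zk, Configuration conf, Abortable abortable)
            throws ReplicationException {
        ReplicationQueues queues = new ReplicationQueuesZKImpl(zk, conf, abortable);
        // Creates /hbase/replication/rs/<servername> if it does not exist yet.
        queues.init("hostname.example.org,6020,1234");
        // Queue one HLog for peer/queue id "1".
        queues.addLog("1", "23522342.23422");
        // Record the latest replicated offset for that HLog.
        queues.setLogPosition("1", "23522342.23422", 254);
        long position = queues.getLogPosition("1", "23522342.23422");
        // Once the HLog is fully replicated, drop it from the queue.
        queues.removeLog("1", "23522342.23422");
        return position;
    }
}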

Source

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp;
import org.apache.zookeeper.KeeperException;

/**
 * This class provides an implementation of the ReplicationQueues interface using ZooKeeper. The
 * base znode that this class works at is the myQueuesZnode. The myQueuesZnode contains a list of
 * all outstanding HLog files on this region server that need to be replicated. The myQueuesZnode is
 * the region server name (a concatenation of the region server's hostname, client port and start
 * code). For example:
 *
 * /hbase/replication/rs/hostname.example.org,6020,1234
 *
 * Within this znode, the region server maintains a set of HLog replication queues. These queues are
 * represented by child znodes named using their given queue id. For example:
 *
 * /hbase/replication/rs/hostname.example.org,6020,1234/1
 * /hbase/replication/rs/hostname.example.org,6020,1234/2
 *
 * Each queue has one child znode for every HLog that still needs to be replicated. The value of
 * these HLog child znodes is the latest position that has been replicated. This position is updated
 * every time an HLog entry is replicated. For example:
 *
 * /hbase/replication/rs/hostname.example.org,6020,1234/1/23522342.23422 [VALUE: 254]
 */
public class ReplicationQueuesZKImpl extends ReplicationStateZKBase implements ReplicationQueues {

    /** Znode containing all replication queues for this region server. */
    private String myQueuesZnode;
    /** Name of znode we use to lock during failover */
    private final static String RS_LOCK_ZNODE = "lock";

    private static final Log LOG = LogFactory.getLog(ReplicationQueuesZKImpl.class);

    public ReplicationQueuesZKImpl(final ZooKeeperWatcher zk, Configuration conf, Abortable abortable) {
        super(zk, conf, abortable);
    }

    @Override
    public void init(String serverName) throws ReplicationException {
        this.myQueuesZnode = ZKUtil.joinZNode(this.queuesZNode, serverName);
        try {
            ZKUtil.createWithParents(this.zookeeper, this.myQueuesZnode);
        } catch (KeeperException e) {
            throw new ReplicationException("Could not initialize replication queues.", e);
        }
    }

    @Override
    public void removeQueue(String queueId) {
        try {
            ZKUtil.deleteNodeRecursively(this.zookeeper, ZKUtil.joinZNode(this.myQueuesZnode, queueId));
        } catch (KeeperException e) {
            this.abortable.abort("Failed to delete queue (queueId=" + queueId + ")", e);
        }
    }

    @Override
    public void addLog(String queueId, String filename) throws ReplicationException {
        String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
        znode = ZKUtil.joinZNode(znode, filename);
        try {
            ZKUtil.createWithParents(this.zookeeper, znode);
        } catch (KeeperException e) {
            throw new ReplicationException("Could not add log because znode could not be created. queueId="
                    + queueId + ", filename=" + filename);
        }
    }

    @Override
    public void removeLog(String queueId, String filename) {
        try {
            String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
            znode = ZKUtil.joinZNode(znode, filename);
            ZKUtil.deleteNode(this.zookeeper, znode);
        } catch (KeeperException e) {
            this.abortable.abort(
                    "Failed to remove hlog from queue (queueId=" + queueId + ", filename=" + filename + ")", e);
        }
    }

    @Override
    public void setLogPosition(String queueId, String filename, long position) {
        try {
            String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
            znode = ZKUtil.joinZNode(znode, filename);
            // Why serialize String of Long and not Long as bytes?
            ZKUtil.setData(this.zookeeper, znode, ZKUtil.positionToByteArray(position));
        } catch (KeeperException e) {
            this.abortable.abort("Failed to write replication hlog position (filename=" + filename + ", position="
                    + position + ")", e);
        }
    }

    @Override
    public long getLogPosition(String queueId, String filename) throws ReplicationException {
        String clusterZnode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
        String znode = ZKUtil.joinZNode(clusterZnode, filename);
        byte[] bytes = null;
        try {
            bytes = ZKUtil.getData(this.zookeeper, znode);
        } catch (KeeperException e) {
            throw new ReplicationException("Internal Error: could not get position in log for queueId=" + queueId
                    + ", filename=" + filename, e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return 0;
        }
        try {
            return ZKUtil.parseHLogPositionFrom(bytes);
        } catch (DeserializationException de) {
            LOG.warn("Failed to parse HLogPosition for queueId=" + queueId + " and hlog=" + filename
                    + "znode content, continuing.");
        }
        // if we can not parse the position, start at the beginning of the hlog file
        // again
        return 0;
    }

    @Override
    public boolean isThisOurZnode(String znode) {
        return ZKUtil.joinZNode(this.queuesZNode, znode).equals(this.myQueuesZnode);
    }

    @Override
    public SortedMap<String, SortedSet<String>> claimQueues(String regionserverZnode) {
        SortedMap<String, SortedSet<String>> newQueues = new TreeMap<String, SortedSet<String>>();
        // check whether there is multi support. If yes, use it.
        if (conf.getBoolean(HConstants.ZOOKEEPER_USEMULTI, true)) {
            LOG.info("Atomically moving " + regionserverZnode + "'s hlogs to my queue");
            newQueues = copyQueuesFromRSUsingMulti(regionserverZnode);
        } else {
            LOG.info("Moving " + regionserverZnode + "'s hlogs to my queue");
            if (!lockOtherRS(regionserverZnode)) {
                return newQueues;
            }
            newQueues = copyQueuesFromRS(regionserverZnode);
            deleteAnotherRSQueues(regionserverZnode);
        }
        return newQueues;
    }

    @Override
    public void removeAllQueues() {
        try {
            ZKUtil.deleteNodeRecursively(this.zookeeper, this.myQueuesZnode);
        } catch (KeeperException e) {
            // if the znode is already expired, don't bother going further
            if (e instanceof KeeperException.SessionExpiredException) {
                return;
            }
            this.abortable.abort("Failed to delete replication queues for region server: " + this.myQueuesZnode, e);
        }
    }

    @Override
    public List<String> getLogsInQueue(String queueId) {
        String znode = ZKUtil.joinZNode(this.myQueuesZnode, queueId);
        List<String> result = null;
        try {
            result = ZKUtil.listChildrenNoWatch(this.zookeeper, znode);
        } catch (KeeperException e) {
            this.abortable.abort("Failed to get list of hlogs for queueId=" + queueId, e);
        }
        return result;
    }

    @Override
    public List<String> getAllQueues() {
        List<String> listOfQueues = null;
        try {
            listOfQueues = ZKUtil.listChildrenNoWatch(this.zookeeper, this.myQueuesZnode);
        } catch (KeeperException e) {
            this.abortable.abort("Failed to get a list of queues for region server: " + this.myQueuesZnode, e);
        }
        return listOfQueues;
    }

    /**
     * Try to set a lock in another region server's znode.
     * @param znode the server name of the other region server
     * @return true if the lock was acquired, false in all other cases
     */
    private boolean lockOtherRS(String znode) {
        try {
            String parent = ZKUtil.joinZNode(this.queuesZNode, znode);
            if (parent.equals(this.myQueuesZnode)) {
                LOG.warn("Won't lock because this is us, we're dead!");
                return false;
            }
            String p = ZKUtil.joinZNode(parent, RS_LOCK_ZNODE);
            ZKUtil.createAndWatch(this.zookeeper, p, lockToByteArray(this.myQueuesZnode));
        } catch (KeeperException e) {
            // This exception will pop up if the znode under which we're trying to
            // create the lock is already deleted by another region server, meaning
            // that the transfer already occurred.
            // NoNode => transfer is done and znodes are already deleted
            // NodeExists => lock znode already created by another RS
            if (e instanceof KeeperException.NoNodeException || e instanceof KeeperException.NodeExistsException) {
                LOG.info("Won't transfer the queue," + " another RS took care of it because of: " + e.getMessage());
            } else {
                LOG.info("Failed lock other rs", e);
            }
            return false;
        }
        return true;
    }

    /**
     * Delete all the replication queues for a given region server.
     * @param regionserverZnode The znode of the region server to delete.
     */
    private void deleteAnotherRSQueues(String regionserverZnode) {
        String fullpath = ZKUtil.joinZNode(this.queuesZNode, regionserverZnode);
        try {
            List<String> clusters = ZKUtil.listChildrenNoWatch(this.zookeeper, fullpath);
            for (String cluster : clusters) {
                // No need to delete, it will be deleted later.
                if (cluster.equals(RS_LOCK_ZNODE)) {
                    continue;
                }
                String fullClusterPath = ZKUtil.joinZNode(fullpath, cluster);
                ZKUtil.deleteNodeRecursively(this.zookeeper, fullClusterPath);
            }
            // Finish cleaning up
            ZKUtil.deleteNodeRecursively(this.zookeeper, fullpath);
        } catch (KeeperException e) {
            if (e instanceof KeeperException.NoNodeException || e instanceof KeeperException.NotEmptyException) {
                // Testing a special case where another region server was able to
                // create a lock just after we deleted it, but then was also able to
                // delete the RS znode before us or its lock znode is still there.
                if (e.getPath().equals(fullpath)) {
                    return;
                }
            }
            this.abortable.abort("Failed to delete replication queues for region server: " + regionserverZnode, e);
        }
    }

    /**
     * It "atomically" copies all the hlogs queues from another region server and returns them all
     * sorted per peer cluster (appended with the dead server's znode).
     * @param znode pertaining to the region server to copy the queues from
     * @return HLog queues sorted per peer cluster
     */
    private SortedMap<String, SortedSet<String>> copyQueuesFromRSUsingMulti(String znode) {
        SortedMap<String, SortedSet<String>> queues = new TreeMap<String, SortedSet<String>>();
        // hbase/replication/rs/deadrs
        String deadRSZnodePath = ZKUtil.joinZNode(this.queuesZNode, znode);
        List<String> peerIdsToProcess = null;
        List<ZKUtilOp> listOfOps = new ArrayList<ZKUtil.ZKUtilOp>();
        try {
            peerIdsToProcess = ZKUtil.listChildrenNoWatch(this.zookeeper, deadRSZnodePath);
            if (peerIdsToProcess == null)
                return queues; // node already processed
            for (String peerId : peerIdsToProcess) {
                ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(peerId);
                if (!peerExists(replicationQueueInfo.getPeerId())) {
                    LOG.warn("Peer " + peerId + " didn't exist, skipping the replay");
                    // Protection against moving orphaned queues
                    continue;
                }
                String newPeerId = peerId + "-" + znode;
                String newPeerZnode = ZKUtil.joinZNode(this.myQueuesZnode, newPeerId);
                // check the logs queue for the old peer cluster
                String oldClusterZnode = ZKUtil.joinZNode(deadRSZnodePath, peerId);
                List<String> hlogs = ZKUtil.listChildrenNoWatch(this.zookeeper, oldClusterZnode);
                if (hlogs == null || hlogs.size() == 0) {
                    listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
                    continue; // empty log queue.
                }
                // create the new cluster znode
                SortedSet<String> logQueue = new TreeSet<String>();
                queues.put(newPeerId, logQueue);
                ZKUtilOp op = ZKUtilOp.createAndFailSilent(newPeerZnode, HConstants.EMPTY_BYTE_ARRAY);
                listOfOps.add(op);
                // get the offset of the logs and set it to new znodes
                for (String hlog : hlogs) {
                    String oldHlogZnode = ZKUtil.joinZNode(oldClusterZnode, hlog);
                    byte[] logOffset = ZKUtil.getData(this.zookeeper, oldHlogZnode);
                    LOG.debug("Creating " + hlog + " with data " + Bytes.toString(logOffset));
                    String newLogZnode = ZKUtil.joinZNode(newPeerZnode, hlog);
                    listOfOps.add(ZKUtilOp.createAndFailSilent(newLogZnode, logOffset));
                    // add ops for deleting
                    listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldHlogZnode));
                    logQueue.add(hlog);
                }
                // add delete op for peer
                listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldClusterZnode));
            }
            // add delete op for dead rs
            listOfOps.add(ZKUtilOp.deleteNodeFailSilent(deadRSZnodePath));
            LOG.debug(" The multi list size is: " + listOfOps.size());
            ZKUtil.multiOrSequential(this.zookeeper, listOfOps, false);
            LOG.info("Atomically moved the dead regionserver logs. ");
        } catch (KeeperException e) {
            // Multi call failed; it looks like some other regionserver took away the logs.
            LOG.warn("Got exception in copyQueuesFromRSUsingMulti: ", e);
            queues.clear();
        } catch (InterruptedException e) {
            LOG.warn("Got exception in copyQueuesFromRSUsingMulti: ", e);
            queues.clear();
            Thread.currentThread().interrupt();
        }
        return queues;
    }

    /**
     * This method copies all the HLog queues from another region server and returns them all sorted
     * per peer cluster (appended with the dead server's znode).
     * @param znode the znode of the region server to copy the queues from
     * @return all HLog queues for all peers of that region server; may be empty or incomplete if an error occurred
     */
    private SortedMap<String, SortedSet<String>> copyQueuesFromRS(String znode) {
        // TODO this method isn't atomic enough, we could start copying and then
        // TODO fail for some reason and we would end up with znodes we don't want.
        SortedMap<String, SortedSet<String>> queues = new TreeMap<String, SortedSet<String>>();
        try {
            String nodePath = ZKUtil.joinZNode(this.queuesZNode, znode);
            List<String> clusters = ZKUtil.listChildrenNoWatch(this.zookeeper, nodePath);
            // We have a lock znode in there, it will count as one.
            if (clusters == null || clusters.size() <= 1) {
                return queues;
            }
            // The lock isn't a peer cluster, remove it
            clusters.remove(RS_LOCK_ZNODE);
            for (String cluster : clusters) {
                ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(cluster);
                if (!peerExists(replicationQueueInfo.getPeerId())) {
                    LOG.warn("Peer " + cluster + " didn't exist, skipping the replay");
                    // Protection against moving orphaned queues
                    continue;
                }
                // We add the name of the recovered RS to the new znode, we can even
                // do that for queues that were recovered 10 times giving a znode like
                // number-startcode-number-otherstartcode-number-anotherstartcode-etc
                String newCluster = cluster + "-" + znode;
                String newClusterZnode = ZKUtil.joinZNode(this.myQueuesZnode, newCluster);
                String clusterPath = ZKUtil.joinZNode(nodePath, cluster);
                List<String> hlogs = ZKUtil.listChildrenNoWatch(this.zookeeper, clusterPath);
                // That region server didn't have anything to replicate for this cluster
                if (hlogs == null || hlogs.size() == 0) {
                    continue;
                }
                ZKUtil.createNodeIfNotExistsAndWatch(this.zookeeper, newClusterZnode, HConstants.EMPTY_BYTE_ARRAY);
                SortedSet<String> logQueue = new TreeSet<String>();
                queues.put(newCluster, logQueue);
                for (String hlog : hlogs) {
                    String z = ZKUtil.joinZNode(clusterPath, hlog);
                    byte[] positionBytes = ZKUtil.getData(this.zookeeper, z);
                    long position = 0;
                    try {
                        position = ZKUtil.parseHLogPositionFrom(positionBytes);
                    } catch (DeserializationException e) {
                        LOG.warn("Failed parse of hlog position from the following znode: " + z + ", Exception: "
                                + e);
                    }
                    LOG.debug("Creating " + hlog + " with data " + position);
                    String child = ZKUtil.joinZNode(newClusterZnode, hlog);
                    // Position doesn't actually change, we are just deserializing it for
                    // logging, so just use the already serialized version
                    ZKUtil.createAndWatch(this.zookeeper, child, positionBytes);
                    logQueue.add(hlog);
                }
            }
        } catch (KeeperException e) {
            this.abortable.abort("Copy queues from rs", e);
        } catch (InterruptedException e) {
            LOG.warn(e);
            Thread.currentThread().interrupt();
        }
        return queues;
    }

    /**
     * @param lockOwner the znode of the region server taking the lock
     * @return Serialized protobuf of <code>lockOwner</code> with the pb magic prefix prepended, suitable
     *         for use as content of a replication lock during region server failover.
     */
    static byte[] lockToByteArray(final String lockOwner) {
        byte[] bytes = ZooKeeperProtos.ReplicationLock.newBuilder().setLockOwner(lockOwner).build().toByteArray();
        return ProtobufUtil.prependPBMagic(bytes);
    }
}
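
For completeness, here is a small companion sketch (not part of the original class) showing how the lock payload produced by lockToByteArray() could be decoded back into the owning region server's queues znode. The helper class and method names are made up for illustration; only ProtobufUtil and the generated ZooKeeperProtos.ReplicationLock message from the listing above are assumed.

import com.google.protobuf.InvalidProtocolBufferException;

import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;

/** Hypothetical helper: reads back the lock owner written by lockToByteArray(). */
class ReplicationLockDecoder {

    static String lockOwnerFrom(final byte[] bytes) throws DeserializationException {
        // The payload starts with the pb magic prefix prepended by ProtobufUtil.prependPBMagic().
        if (bytes == null || !ProtobufUtil.isPBMagicPrefix(bytes)) {
            throw new DeserializationException("Missing pb magic prefix on lock znode content");
        }
        int pblen = ProtobufUtil.lengthOfPBMagic();
        try {
            ZooKeeperProtos.ReplicationLock lock = ZooKeeperProtos.ReplicationLock.newBuilder()
                    .mergeFrom(bytes, pblen, bytes.length - pblen).build();
            return lock.getLockOwner();
        } catch (InvalidProtocolBufferException e) {
            throw new DeserializationException(e);
        }
    }
}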