Source code for org.apache.hadoop.hdfs.client.DfsClientShmManager, part of the
Apache Hadoop HDFS client library.

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.client;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;
import org.apache.hadoop.classification.InterfaceAudience;

/**
 * Manages short-circuit memory segments for an HDFS client.
 * 
 * Clients are responsible for requesting and releasing shared memory segments used
 * for communicating with the DataNode. The client will try to allocate new slots
 * in the set of existing segments, falling back to getting a new segment from the
 * DataNode via {@link DataTransferProtocol#requestShortCircuitFds}.
 * 
 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
 */
@InterfaceAudience.Private
public class DfsClientShmManager implements Closeable {
    private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);

    /**
     * Manages short-circuit memory segments that pertain to a given DataNode.
     */
    class EndpointShmManager {
        /**
         * The datanode we're managing.
         */
        private final DatanodeInfo datanode;

        /**
         * Shared memory segments which have no empty slots.
         *
         * Protected by the manager lock.
         */
        private final TreeMap<ShmId, DfsClientShm> full = new TreeMap<ShmId, DfsClientShm>();

        /**
         * Shared memory segments which have at least one empty slot.
         *
         * Protected by the manager lock.
         */
        private final TreeMap<ShmId, DfsClientShm> notFull = new TreeMap<ShmId, DfsClientShm>();

        /**
         * True if this datanode doesn't support short-circuit shared memory
         * segments.
         *
         * Protected by the manager lock.
         */
        private boolean disabled = false;

        /**
         * True if we're in the process of loading a shared memory segment from
         * this DataNode.
         *
         * Protected by the manager lock.
         */
        private boolean loading = false;

        /**
         * Create a manager for the shared memory segments of one DataNode.
         *
         * @param datanode      The DataNode whose segments this object manages.
         */
        EndpointShmManager(DatanodeInfo datanode) {
            this.datanode = datanode;
        }

        /**
         * Pull a slot out of a preexisting shared memory segment.
         *
         * Must be called with the manager lock held.
         *
         * @param blockId     The blockId to put inside the Slot object.
         *
         * @return            null if none of our shared memory segments contain a
         *                      free slot; the slot object otherwise.
         */
        private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
            if (notFull.isEmpty()) {
                return null;
            }
            // Always allocate out of the segment with the lowest ID, to limit
            // fragmentation across segments.
            DfsClientShm segment = notFull.firstEntry().getValue();
            ShmId segmentId = segment.getShmId();
            Slot allocated = segment.allocAndRegisterSlot(blockId);
            if (!segment.isFull()) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace(this + ": pulled slot " + allocated.getSlotIdx() + " out of " + segment);
                }
                return allocated;
            }
            // We just handed out the last free slot: migrate the segment from
            // the notFull map to the full map.
            if (LOG.isTraceEnabled()) {
                LOG.trace(this + ": pulled the last slot " + allocated.getSlotIdx() + " out of " + segment);
            }
            DfsClientShm previous = notFull.remove(segmentId);
            Preconditions.checkState(previous == segment);
            full.put(segmentId, segment);
            return allocated;
        }

        /**
         * Ask the DataNode for a new shared memory segment.  This function must be
         * called with the manager lock held.  We will release the lock while
         * communicating with the DataNode.
         *
         * @param clientName    The current client name.
         * @param peer          The peer to use to talk to the DataNode.
         *
         * @return              Null if the DataNode does not support shared memory
         *                        segments, or experienced an error creating the
         *                        shm.  The shared memory segment itself on success.
         * @throws IOException  If there was an error communicating over the socket.
         *                        We will not throw an IOException unless the socket
         *                        itself (or the network) is the problem.
         */
        private DfsClientShm requestNewShm(String clientName, DomainPeer peer) throws IOException {
            // Send the shared memory request over the domain socket.
            final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
            new Sender(out).requestShortCircuitShm(clientName);
            // Read back the vint-prefixed protobuf response.
            ShortCircuitShmResponseProto resp = ShortCircuitShmResponseProto
                    .parseFrom(PBHelper.vintPrefixed(peer.getInputStream()));
            String error = resp.hasError() ? resp.getError() : "(unknown)";
            switch (resp.getStatus()) {
            case SUCCESS:
                // On success the DataNode passes the segment's file descriptor
                // over the UNIX domain socket, alongside one data byte.
                DomainSocket sock = peer.getDomainSocket();
                byte buf[] = new byte[1];
                FileInputStream fis[] = new FileInputStream[1];
                if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
                    throw new EOFException("got EOF while trying to transfer the "
                            + "file descriptor for the shared memory segment.");
                }
                if (fis[0] == null) {
                    throw new IOException("the datanode " + datanode + " failed to "
                            + "pass a file descriptor for the shared memory segment.");
                }
                try {
                    DfsClientShm shm = new DfsClientShm(PBHelper.convert(resp.getId()), fis[0], this, peer);
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(this + ": createNewShm: created " + shm);
                    }
                    return shm;
                } finally {
                    // Always close our copy of the received stream.
                    // NOTE(review): this assumes DfsClientShm takes what it needs
                    // from the descriptor during construction — confirm there.
                    IOUtils.cleanup(LOG, fis[0]);
                }
            case ERROR_UNSUPPORTED:
                // The DataNode just does not support short-circuit shared memory
                // access, and we should stop asking.
                LOG.info(this + ": datanode does not support short-circuit " + "shared memory access: " + error);
                disabled = true;
                return null;
            default:
                // The datanode experienced some kind of unexpected error when trying to
                // create the short-circuit shared memory segment.
                LOG.warn(this + ": error requesting short-circuit shared memory " + "access: " + error);
                return null;
            }
        }

        /**
         * Allocate a new shared memory slot connected to this datanode.
         *
         * Must be called with the manager lock held.  The lock is dropped and
         * reacquired internally while talking to the DataNode.
         *
         * @param peer          The peer to use to talk to the DataNode.
         * @param clientName    The client name.
         * @param usedPeer      (out param) Will be set to true if we used the peer.
         *                        When a peer is used, the caller is no longer
         *                        responsible for closing it.
         * @param blockId       The blockId to put inside the Slot object.
         *
         * @return              null if the DataNode does not support shared memory
         *                        segments, or experienced an error creating the
         *                        shm, or the manager is closed.  The allocated
         *                        slot on success.
         * @throws IOException  If there was an error communicating over the socket.
         */
        Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer, String clientName, ExtendedBlockId blockId)
                throws IOException {
            while (true) {
                if (closed) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(this + ": the DfsClientShmManager has been closed.");
                    }
                    return null;
                }
                if (disabled) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(this + ": shared memory segment access is disabled.");
                    }
                    return null;
                }
                // Try to use an existing slot.
                Slot slot = allocSlotFromExistingShm(blockId);
                if (slot != null) {
                    return slot;
                }
                // There are no free slots.  If someone is loading more slots, wait
                // for that to finish.
                if (loading) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(this + ": waiting for loading to finish...");
                    }
                    finishedLoading.awaitUninterruptibly();
                } else {
                    // Otherwise, load the slot ourselves.
                    loading = true;
                    // Drop the manager lock while communicating with the DataNode;
                    // the 'loading' flag keeps other threads from doing the same.
                    lock.unlock();
                    DfsClientShm shm;
                    try {
                        shm = requestNewShm(clientName, peer);
                        // Note: 'continue' still runs the finally block, which
                        // retakes the lock and clears 'loading'.
                        if (shm == null)
                            continue;
                        // See #{DfsClientShmManager#domainSocketWatcher} for details
                        // about why we do this before retaking the manager lock.
                        domainSocketWatcher.add(peer.getDomainSocket(), shm);
                        // The DomainPeer is now our responsibility, and should not be
                        // closed by the caller.
                        usedPeer.setValue(true);
                    } finally {
                        lock.lock();
                        loading = false;
                        finishedLoading.signalAll();
                    }
                    if (shm.isStale()) {
                        // If the peer closed immediately after the shared memory segment
                        // was created, the DomainSocketWatcher callback might already have
                        // fired and marked the shm as stale.  In this case, we obviously
                        // don't want to add the SharedMemorySegment to our list of valid
                        // not-full segments.
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(this + ": the UNIX domain socket associated with "
                                    + "this short-circuit memory closed before we could make " + "use of the shm.");
                        }
                    } else {
                        notFull.put(shm.getShmId(), shm);
                    }
                }
            }
        }

        /**
         * Stop tracking a slot.
         *
         * Must be called with the manager lock held.
         *
         * If this was the last used slot of a non-stale segment, the segment's
         * UNIX domain socket is shut down, which eventually causes the
         * DomainSocketWatcher to clean the segment up.
         *
         * @param slot          The slot to release.
         */
        void freeSlot(Slot slot) {
            DfsClientShm shm = (DfsClientShm) slot.getShm();
            shm.unregisterSlot(slot.getSlotIdx());
            if (shm.isStale()) {
                // Stale shared memory segments should not be tracked here.
                Preconditions.checkState(!full.containsKey(shm.getShmId()));
                Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
                if (shm.isEmpty()) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(this + ": freeing empty stale " + shm);
                    }
                    shm.free();
                }
            } else {
                ShmId shmId = shm.getShmId();
                full.remove(shmId); // The shm can't be full if we just freed a slot.
                if (shm.isEmpty()) {
                    notFull.remove(shmId);

                    // If the shared memory segment is now empty, we call shutdown(2) on
                    // the UNIX domain socket associated with it.  The DomainSocketWatcher,
                    // which is watching this socket, will call DfsClientShm#handle,
                    // cleaning up this shared memory segment.
                    //
                    // See #{DfsClientShmManager#domainSocketWatcher} for details about why
                    // we don't want to call DomainSocketWatcher#remove directly here.
                    //
                    // Note that we could experience 'fragmentation' here, where the
                    // DFSClient allocates a bunch of slots in different shared memory
                    // segments, and then frees most of them, but never fully empties out
                    // any segment.  We make some attempt to avoid this fragmentation by
                    // always allocating new slots out of the shared memory segment with the
                    // lowest ID, but it could still occur.  In most workloads,
                    // fragmentation should not be a major concern, since it doesn't impact
                    // peak file descriptor usage or the speed of allocation.
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(this + ": shutting down UNIX domain socket for " + "empty " + shm);
                    }
                    shutdown(shm);
                } else {
                    notFull.put(shmId, shm);
                }
            }
        }

        /**
         * Unregister a shared memory segment.
         *
         * Once a segment is unregistered, we will not allocate any more slots
         * inside that segment.
         *
         * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
         * lock.
         *
         * @param shmId         The ID of the shared memory segment to unregister.
         */
        void unregisterShm(ShmId shmId) {
            lock.lock();
            try {
                // Drop the segment from both maps; at most one of them holds it.
                notFull.remove(shmId);
                full.remove(shmId);
            } finally {
                lock.unlock();
            }
        }

        /**
         * Identifies this endpoint manager by its datanode and parent manager.
         */
        @Override
        public String toString() {
            // Same text as the String.format("EndpointShmManager(%s, parent=%s)", ...) form.
            return "EndpointShmManager(" + datanode + ", parent=" + DfsClientShmManager.this + ")";
        }

        /**
         * Build a {@link PerDatanodeVisitorInfo} over this endpoint's state.
         * Note: the maps are passed by reference, not copied.
         */
        PerDatanodeVisitorInfo getVisitorInfo() {
            final PerDatanodeVisitorInfo view = new PerDatanodeVisitorInfo(full, notFull, disabled);
            return view;
        }

        /**
         * Call shutdown(2) on the UNIX domain socket backing the given segment's
         * peer.  Errors are logged rather than propagated.
         *
         * @param shm           The segment whose socket should be shut down.
         */
        final void shutdown(DfsClientShm shm) {
            DomainSocket sock = shm.getPeer().getDomainSocket();
            try {
                sock.shutdown();
            } catch (IOException e) {
                LOG.warn(this + ": error shutting down shm: got IOException calling " + "shutdown(SHUT_RDWR)", e);
            }
        }
    }

    /**
     * True once {@link #close()} has been called.
     *
     * Protected by the manager lock.
     */
    private boolean closed = false;

    /**
     * The manager lock, guarding all of the mutable state in this class and in
     * each EndpointShmManager.
     */
    private final ReentrantLock lock = new ReentrantLock();

    /**
     * A condition variable which is signalled when we finish loading a segment
     * from the Datanode.
     */
    private final Condition finishedLoading = lock.newCondition();

    /**
     * Information about each Datanode.
     */
    private final HashMap<DatanodeInfo, EndpointShmManager> datanodes = new HashMap<DatanodeInfo, EndpointShmManager>(
            1);

    /**
     * The DomainSocketWatcher which keeps track of the UNIX domain socket
     * associated with each shared memory segment.
     *
     * Note: because the DomainSocketWatcher makes callbacks into this
     * DfsClientShmManager object, you MUST NOT attempt to take the
     * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
     * or else deadlock might result.   This means that most DomainSocketWatcher
     * methods are off-limits unless you release the manager lock first.
     */
    private final DomainSocketWatcher domainSocketWatcher;

    /**
     * Create a DfsClientShmManager.
     *
     * @param interruptCheckPeriodMs  Interrupt-check period, in milliseconds,
     *                                  passed to the DomainSocketWatcher.
     * @throws IOException            If the DomainSocketWatcher cannot be created.
     */
    DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
        this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
    }

    /**
     * Allocate a new shared memory slot for the given datanode and block.
     *
     * @param datanode      The datanode to allocate a shm slot with.
     * @param peer          A peer connected to the datanode.
     * @param usedPeer      (out param) Will be set to true if we used the peer.
     *                        When a peer is used, the caller is no longer
     *                        responsible for closing it.
     * @param blockId       The block id and block pool id of the block we're
     *                        allocating this slot for.
     * @param clientName    The name of the DFSClient allocating the shared
     *                        memory.
     * @return              null if the datanode does not support shared memory
     *                        segments, or experienced an error creating the
     *                        shm, or the manager is closed.  The slot we were
     *                        able to allocate otherwise.
     * @throws IOException  If there was an error communicating over the socket.
     */
    public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer, MutableBoolean usedPeer, ExtendedBlockId blockId,
            String clientName) throws IOException {
        lock.lock();
        try {
            if (closed) {
                // Fixed the "isclosed" typo, and guard the trace call like the
                // rest of this file does.
                if (LOG.isTraceEnabled()) {
                    LOG.trace(this + ": the DfsClientShmManager is closed.");
                }
                return null;
            }
            // Lazily create the per-datanode manager on first use.
            EndpointShmManager shmManager = datanodes.get(datanode);
            if (shmManager == null) {
                shmManager = new EndpointShmManager(datanode);
                datanodes.put(datanode, shmManager);
            }
            return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
        } finally {
            lock.unlock();
        }
    }

    /**
     * Release a slot back to the shared memory segment that owns it.
     *
     * @param slot          The slot to free.
     */
    public void freeSlot(Slot slot) {
        lock.lock();
        try {
            // Delegate to the per-datanode manager that owns this segment.
            DfsClientShm segment = (DfsClientShm) slot.getShm();
            segment.getEndpointShmManager().freeSlot(slot);
        } finally {
            lock.unlock();
        }
    }

    /**
     * A view of the state tracked for a single DataNode, exposed to tests via
     * {@link Visitor}.  The maps are the manager's live maps, not copies, so
     * they must only be read while the manager lock is held.
     */
    @VisibleForTesting
    public static class PerDatanodeVisitorInfo {
        public final TreeMap<ShmId, DfsClientShm> full;
        public final TreeMap<ShmId, DfsClientShm> notFull;
        public final boolean disabled;

        PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full, TreeMap<ShmId, DfsClientShm> notFull,
                boolean disabled) {
            this.disabled = disabled;
            this.notFull = notFull;
            this.full = full;
        }
    }

    /**
     * Callback interface used by tests to inspect the manager's per-datanode
     * state.  Invoked by {@link DfsClientShmManager#visit} while the manager
     * lock is held.
     */
    @VisibleForTesting
    public interface Visitor {
        void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info) throws IOException;
    }

    /**
     * Run the given visitor against a map of the per-datanode state, holding
     * the manager lock for the duration of the callback.
     *
     * @param visitor        The visitor to invoke.
     * @throws IOException   If the visitor throws one.
     */
    @VisibleForTesting
    public void visit(Visitor visitor) throws IOException {
        lock.lock();
        try {
            HashMap<DatanodeInfo, PerDatanodeVisitorInfo> snapshot = new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
            for (Entry<DatanodeInfo, EndpointShmManager> pair : datanodes.entrySet()) {
                snapshot.put(pair.getKey(), pair.getValue().getVisitorInfo());
            }
            visitor.visit(snapshot);
        } finally {
            lock.unlock();
        }
    }

    /**
     * Close the DfsClientShmManager.
     *
     * Marks the manager closed under the lock, then — with the lock released —
     * shuts down the DomainSocketWatcher.  Idempotent.
     */
    @Override
    public void close() throws IOException {
        lock.lock();
        try {
            if (closed) {
                return;
            }
            closed = true;
        } finally {
            lock.unlock();
        }
        // Closing the watcher issues callbacks that mark all the outstanding
        // DfsClientShm segments as stale.  This must happen without the manager
        // lock held (see the domainSocketWatcher field's javadoc).
        IOUtils.cleanup(LOG, domainSocketWatcher);
    }

    /**
     * Identifies this manager by its identity hash code, zero-padded to eight
     * hex digits.
     */
    @Override
    public String toString() {
        int identity = System.identityHashCode(this);
        return String.format("ShortCircuitShmManager(%08x)", identity);
    }

    /**
     * @return  The DomainSocketWatcher tracking the sockets behind our shared
     *          memory segments.  Exposed for testing.
     */
    @VisibleForTesting
    public DomainSocketWatcher getDomainSocketWatcher() {
        return domainSocketWatcher;
    }
}