org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalManager.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalManager.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode.bookkeeper;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import org.apache.bookkeeper.client.BKException;
import org.apache.bookkeeper.client.BookKeeper;
import org.apache.bookkeeper.client.LedgerHandle;
import org.apache.bookkeeper.conf.ClientConfiguration;
import org.apache.bookkeeper.util.ZkUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.RemoteStorageState;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.BookKeeperJournalMetadataManager;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.CurrentInProgressMetadata;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.EditLogLedgerMetadata;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.MaxTxId;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.Versioned;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.proto.FormatInfoWritable;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.proto.WritableUtil;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.BasicZooKeeper;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.ConnectionWatcher;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.RecoveringZooKeeper;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.Code;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import static org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalConfigKeys.*;
import static org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.ZkUtil.*;
import static org.apache.zookeeper.AsyncCallback.*;

/**
 * BookKeeper-based JournalManager implementation. This is inspired by
 * Apache's BookKeeperJournalManager, with several core differences:
 * interaction with ZooKeeper goes through {@link RecoveringZooKeeper},
 * custom {@link BookKeeperEditLogInputStream} implementation is used that
 * permits tailing in-progress edits and re-positioning within a ledger-based
 * output stream, and a custom {@link BookKeeperEditLogOutputStream} is used
 * that uses double buffer as used by the standard file journal manager
 * implementation.
 */
public class BookKeeperJournalManager implements JournalManager, LedgerHandleProvider {

    private static final Log LOG = LogFactory.getLog(BookKeeperJournalManager.class);

    // Version of the protocol used for serializing and de-serializing data in
    // znodes (i.e., the Writables); compared against the value stored in the
    // format-info ZNode by checkEnv()

    static final int PROTO_VERSION = -1;

    private final Configuration conf; // Source of all BKJM_* settings

    private final int quorumSize; // BookKeeper write quorum size for each ledger
    private final int ensembleSize; // Number of bookies each ledger is spread over
    private final BookKeeper bookKeeperClient; // BookKeeper client
    private final RecoveringZooKeeper zk; // Retrying wrapper around the raw ZooKeeper client

    private final String digestPw; // Password used for ledger MAC digests
    @VisibleForTesting
    protected final String zkParentPath; // Parent ZNode for all journal metadata
    @VisibleForTesting
    protected final String formatInfoPath; // ZNode holding format/namespace information

    // Handles ledger metadata (listing, reading, writing, deleting ZNodes)
    final BookKeeperJournalMetadataManager metadataManager;
    private final MaxTxId maxTxId; // stores max txid seen so far, in ZooKeeper

    // Pointer (in ZooKeeper) to the metadata of the segment currently in progress
    private final CurrentInProgressMetadata currentInProgressMetadata;

    private boolean initialized = false; // set once checkEnv() has verified format info
    private LedgerHandle currentInProgressLedger = null; // Ledger backing the open segment, if any

    // ZNode path of the current in-progress segment's metadata; consulted by
    // recoverUnfinalizedSegments() to avoid recovering the segment we write
    @VisibleForTesting
    volatile String currentInProgressPath;

    private volatile NameNodeMetrics metrics = null;

    // Highest txid observed by findMaxTransaction(); -1 until computed
    private long maxSeenTxId = -1;

    // Per-thread reusable writable for (de)serializing format information
    private static final ThreadLocal<FormatInfoWritable> localFormatInfoWritable = new ThreadLocal<FormatInfoWritable>() {
        @Override
        protected FormatInfoWritable initialValue() {
            return new FormatInfoWritable();
        }
    };

    /**
     * Constructs the journal manager: connects to ZooKeeper, prepares the
     * bookie registration path, creates the BookKeeper client, and creates
     * this journal's ZooKeeper metadata if it does not yet exist.
     *
     * @param conf configuration supplying BookKeeper/ZooKeeper settings
     * @param uri journal URI; the authority carries the ZooKeeper connect
     *            string (';'-separated hosts, since ',' is reserved in URIs)
     *            and the path is the parent ZNode for journal metadata
     * @param nsInfo namespace/format information written to ZooKeeper on
     *               first use
     * @param metrics NameNode metrics sink passed to output streams
     * @throws IOException if connecting to ZooKeeper times out or creating
     *                     the BookKeeper client fails
     */
    public BookKeeperJournalManager(Configuration conf, URI uri, NamespaceInfo nsInfo, NameNodeMetrics metrics)
            throws IOException {
        this.conf = conf;
        this.metrics = metrics;
        quorumSize = conf.getInt(BKJM_BOOKKEEPER_QUORUM_SIZE, BKJM_BOOKKEEPER_QUORUM_SIZE_DEFAULT);
        ensembleSize = conf.getInt(BKJM_BOOKKEEPER_ENSEMBLE_SIZE, BKJM_BOOKKEEPER_ENSEMBLE_SIZE_DEFAULT);
        digestPw = conf.get(BKJM_BOOKKEEPER_DIGEST_PW, BKJM_BOOKKEEPER_DIGEST_PW_DEFAULT);
        String zkConnect = uri.getAuthority().replace(";", ",");
        zkParentPath = uri.getPath();
        String ledgersAvailablePath = conf.get(BKJM_ZK_LEDGERS_AVAILABLE_PATH,
                BKJM_ZK_LEDGERS_AVAILABLE_PATH_DEFAULT);
        formatInfoPath = joinPath(zkParentPath, "version");
        // Renamed from "currentInProgressPath" so this local no longer shadows
        // the field of the same name (the field is assigned in
        // startLogSegment(), not here).
        String currentInProgressZNode = joinPath(zkParentPath, "CurrentInProgress");
        String maxTxIdPath = joinPath(zkParentPath, "maxtxid");
        int zkSessionTimeoutMs = conf.getInt(BKJM_ZK_SESSION_TIMEOUT, BKJM_ZK_SESSION_TIMEOUT_DEFAULT);
        int zkMaxRetries = conf.getInt(BKJM_ZK_MAX_RETRIES, BKJM_ZK_MAX_RETRIES_DEFAULT);
        int zkRetryIntervalMs = conf.getInt(BKJM_ZK_RETRY_INTERVAL, BKJM_ZK_RETRY_INTERVAL_DEFAULT);
        CountDownLatch connectLatch = new CountDownLatch(1);
        ConnectionWatcher connectionWatcher = new ConnectionWatcher(connectLatch);
        ZooKeeper zooKeeper = new ZooKeeper(zkConnect, zkSessionTimeoutMs, connectionWatcher);
        // Use twice the session timeout as the connection timeout
        int zkConnectTimeoutMs = zkSessionTimeoutMs * 2;

        if (!connectionWatcher.await(zkConnectTimeoutMs)) {
            throw new IOException(
                    "Timed out waiting to connect to " + zkConnect + " after " + zkConnectTimeoutMs + " ms.");
        }
        prepareBookKeeperEnv(ledgersAvailablePath, zooKeeper);

        try {
            ClientConfiguration clientConf = new ClientConfiguration();
            clientConf.setClientTcpNoDelay(conf.getBoolean(BKJM_BOOKKEEPER_CLIENT_TCP_NODELAY,
                    BKJM_BOOKKEEPER_CLIENT_TCP_NO_DELAY_DEFAULT));
            clientConf.setThrottleValue(
                    conf.getInt(BKJM_BOOKKEEPER_CLIENT_THROTTLE, BKJM_BOOKKEEPER_CLIENT_THROTTLE_DEFAULT));
            bookKeeperClient = new BookKeeper(clientConf, zooKeeper);
        } catch (KeeperException e) {
            keeperException("Unrecoverable ZooKeeper creating BookKeeper client", e);
            throw new IllegalStateException(e); // never reached: keeperException always throws
        } catch (InterruptedException e) {
            interruptedException("Interrupted creating a BookKeeper client", e);
            throw new IllegalStateException(e); // never reached: interruptedException always throws
        }
        zk = new RecoveringZooKeeper(new BasicZooKeeper(zooKeeper), zkMaxRetries, zkRetryIntervalMs);
        metadataManager = new BookKeeperJournalMetadataManager(zk, zkParentPath);
        maxTxId = new MaxTxId(zk, maxTxIdPath);
        currentInProgressMetadata = new CurrentInProgressMetadata(zk, currentInProgressZNode);
        createZkMetadataIfNotExists(nsInfo);
        metadataManager.init();
    }

    /**
     * Logs a BookKeeper failure and rethrows it wrapped in an IOException.
     * This method never returns normally.
     * @param msg context describing the failed operation
     * @param e underlying BookKeeper exception (kept as the cause)
     * @throws IOException always
     */
    public static void bkException(String msg, BKException e) throws IOException {
        IOException wrapped = new IOException(msg, e);
        LOG.error(msg, e);
        throw wrapped;
    }

    /**
     * Creates the ZNode under which available BookKeeper bookie servers
     * register themselves, including any missing ancestors of that path.
     * @see ZkUtils#createFullPathOptimistic(ZooKeeper, String, byte[], List, CreateMode, StringCallback, Object)
     * @param availablePath full ZooKeeper path used by bookies to register
     *                      themselves
     * @param zooKeeper fully instantiated ZooKeeper instance
     * @throws IOException if the path could not be created within the
     *                     ZooKeeper session timeout, or if the wait was
     *                     interrupted
     */
    @VisibleForTesting
    public static void prepareBookKeeperEnv(final String availablePath, ZooKeeper zooKeeper) throws IOException {
        final CountDownLatch createdLatch = new CountDownLatch(1);
        StringCallback createCallback = new StringCallback() {
            @Override
            public void processResult(int rc, String path, Object ctx, String name) {
                boolean pathReady = rc == Code.OK.intValue() || rc == Code.NODEEXISTS.intValue();
                if (pathReady) {
                    // "Already exists" counts as success: the path is usable either way
                    createdLatch.countDown();
                    LOG.info("Successfully created bookie available path:" + availablePath);
                } else {
                    // Latch is deliberately NOT counted down on failure; the
                    // await below will time out and surface an IOException
                    LOG.error("Failed to create available bookie path (" + availablePath + ")",
                            KeeperException.create(Code.get(rc), path));
                }
            }
        };
        ZkUtils.createFullPathOptimistic(zooKeeper, availablePath, new byte[0], Ids.OPEN_ACL_UNSAFE,
                CreateMode.PERSISTENT, createCallback, null);
        int timeoutMs = zooKeeper.getSessionTimeout();
        try {
            boolean created = createdLatch.await(timeoutMs, TimeUnit.MILLISECONDS);
            if (!created) {
                throw new IOException("Couldn't create the bookie available path : " + availablePath
                        + ", timed out after " + timeoutMs + " ms.");
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status for callers
            throw new IOException("Interrupted when creating the bookie available path: " + availablePath, e);
        }
    }

    /**
     * If environment information has not yet been read during this object's
     * life, read it and verify that it was written with the expected protocol
     * version. Additionally, the call always refreshes the object's current
     * {@link CurrentInProgressMetadata} information.
     * @throws IOException if the format info ZNode is missing (journal was
     *                     never formatted) or cannot be read
     * @throws IllegalStateException if the stored protocol version does not
     *                               match {@link #PROTO_VERSION}
     */
    private synchronized void checkEnv() throws IOException {
        if (!initialized) {
            FormatInfoWritable writable = localFormatInfoWritable.get();
            if (metadataManager.readWritableFromZk(formatInfoPath, writable, null) == null) {
                LOG.error("Environment not initialized (format() not called?)");
                // Fixed: message previously had an unbalanced parenthesis
                throw new IOException("Environment not initialized (format() not called?)");
            }
            if (writable.getProtoVersion() != PROTO_VERSION) {
                // Fixed: message previously printed BKJM_BOOKKEEPER_DIGEST_PW
                // (the digest password config key) instead of the expected
                // protocol version
                throw new IllegalStateException("Wrong protocol version! Expected " + PROTO_VERSION
                        + ", but read " + writable.getProtoVersion());
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Namespace info read: " + writable.toColonSeparatedString());
            }
        }
        currentInProgressMetadata.init();
        initialized = true;
    }

    /**
     * Opens a ledger for reading without recovery, so an in-progress ledger
     * can still be written to by another process while we read it.
     * @param ledgerId id of the BookKeeper ledger to open
     * @return a handle for reading from the ledger
     * @throws IOException on BookKeeper errors or interruption
     */
    @VisibleForTesting
    public LedgerHandle openForReading(long ledgerId) throws IOException {
        try {
            return bookKeeperClient.openLedgerNoRecovery(ledgerId, BookKeeper.DigestType.MAC, digestPw.getBytes());
        } catch (BKException e) {
            bkException("BookKeeper error opening ledger id " + ledgerId + " for reading", e);
        } catch (InterruptedException e) {
            interruptedException("Interrupted while opening ledger id " + ledgerId + " for reading", e);
        }
        return null; // unreachable: both handlers above always throw
    }

    /**
     * Applies a storage transition to this journal. Only
     * {@link Transition#FORMAT} is supported: it wipes existing metadata and
     * ledgers, then re-creates the ZooKeeper metadata from the given
     * StorageInfo.
     * @param si format information to write
     * @param transition transition to perform; must be FORMAT
     * @param startOpt startup option (unused here)
     * @throws IOException on BookKeeper or ZooKeeper errors
     * @throws UnsupportedOperationException for any transition other than FORMAT
     */
    @Override
    public void transitionJournal(StorageInfo si, Transition transition, StartupOption startOpt)
            throws IOException {
        if (transition != Transition.FORMAT) {
            throw new UnsupportedOperationException();
        }
        deleteMetadataAndLedgers();
        createZkMetadataIfNotExists(si);
        metadataManager.init();
    }

    /**
     * If ZooKeeper metadata is not empty, forcefully delete the metadata and
     * make a best-effort attempt at deleting the ledgers. Used when the
     * journal is re-formatted via
     * {@link #transitionJournal(StorageInfo, Transition, StartupOption)}.
     * @throws IOException if there is an error talking to BookKeeper or
     *                     ZooKeeper
     */
    private void deleteMetadataAndLedgers() throws IOException {
        try {
            if (!hasSomeJournalData()) {
                return; // nothing to clear out
            }
            if (zkPathExists(metadataManager.getLedgerParentPath())) {
                for (EditLogLedgerMetadata ledgerMeta : metadataManager.listLedgers(true)) {
                    try {
                        // Try to delete the individual ledger from BookKeeper
                        bookKeeperClient.deleteLedger(ledgerMeta.getLedgerId());
                    } catch (BKException e) {
                        // Best effort: an undeleted ledger will never be read
                        // again and can be removed manually later
                        LOG.warn("Unable to delete ledger " + ledgerMeta + " from BookKeeper", e);
                    } catch (InterruptedException e) {
                        interruptedException("Interrupted deleting ledger " + ledgerMeta, e);
                    }
                }
            }
            deleteRecursively(zk, zkParentPath);
        } catch (IOException e) {
            LOG.error("Error clearing out metadata under " + zkParentPath, e);
            throw e;
        }
    }

    /**
     * If there is no metadata present in ZooKeeper, create and populate the
     * metadata with the right format information.
     * @param si the format information to set
     * @throws IOException if there is an error writing to ZooKeeper
     */
    private void createZkMetadataIfNotExists(StorageInfo si) throws IOException {
        try {
            if (hasSomeJournalData()) {
                return; // metadata already present; nothing to create
            }
            try {
                // First create the parent path
                zk.create(zkParentPath, new byte[] { '0' }, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);

                // Then write format/namespace information under it
                FormatInfoWritable formatInfo = localFormatInfoWritable.get();
                formatInfo.set(PROTO_VERSION, si);
                byte[] serialized = WritableUtil.writableToByteArray(formatInfo);
                zk.create(formatInfoPath, serialized, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
            } catch (KeeperException e) {
                keeperException("Unrecoverable ZooKeeper error initializing " + zkParentPath, e);
            } catch (InterruptedException e) {
                interruptedException("Interrupted initializing " + zkParentPath + " in ZooKeeper", e);
            }
        } catch (IOException e) {
            LOG.error("Unable to initialize metadata", e);
            throw e;
        }
    }

    /**
     * Checks whether a ZNode exists in ZooKeeper.
     * @param path the ZNode path to check
     * @return true if the path exists, false otherwise
     * @throws IOException if there is an error talking to ZooKeeper
     */
    private boolean zkPathExists(String path) throws IOException {
        try {
            return null != zk.exists(path, false);
        } catch (InterruptedException e) {
            interruptedException("Interrupted checking if ZooKeeper path " + path + " exists", e);
        } catch (KeeperException e) {
            keeperException("Unrecoverable ZooKeeper error checking if " + path + " exists", e);
        }
        return false; // unreachable: both handlers above always throw
    }

    /**
     * Starts a new edit log segment beginning at txId: creates a new
     * BookKeeper ledger, records its metadata in ZooKeeper, advances maxTxId,
     * points the current-in-progress metadata at the new segment, and returns
     * an output stream positioned after the ledger header.
     * @param txId first transaction id of the new segment; must be greater
     *             than the max txid seen so far
     * @return output stream for writing edits to the new segment
     * @throws IOException if txId was already seen, if another segment is
     *                     already in progress, or on BookKeeper/ZooKeeper
     *                     errors
     */
    @Override
    public EditLogOutputStream startLogSegment(long txId) throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trying to start a log segment at txId " + txId);
        }

        checkEnv();

        try {
            // Refuse to start a segment at a txid we have already handed out
            long currMaxTxId = maxTxId.get();
            if (txId <= currMaxTxId) {
                throw new IOException("Already saw up to txId " + currMaxTxId + "!");
            }

            // Refuse to start while another segment is still in progress
            String existingInProgress = currentInProgressMetadata.read();
            if (existingInProgress != null && metadataManager.ledgerExists(existingInProgress)) {
                throw new IOException(existingInProgress + " already exists, cannot "
                        + " start a log segment that is already in progress!");
            }
        } catch (IOException e) {
            LOG.error("Unable to start log segment for txId " + txId, e);
            throw e;
        }

        try {
            // Close any leftover ledger handle from a previous segment
            // (presumably left open after an error on the last stream —
            // TODO confirm against callers)
            if (currentInProgressLedger != null) {
                currentInProgressLedger.close();
            }
            currentInProgressLedger = bookKeeperClient.createLedger(ensembleSize, quorumSize,
                    BookKeeper.DigestType.MAC, digestPw.getBytes());
        } catch (BKException e) {
            bkException("BookKeeper error creating ledger for txId " + txId, e);
        } catch (InterruptedException e) {
            interruptedException("Interrupted creating ledger for txId " + txId, e);
        }

        // Create ZooKeeper metadata associated with the edit log segment
        // starting at txId; lastTxId of -1 marks the segment as in-progress
        EditLogLedgerMetadata ledgerMetadata = new EditLogLedgerMetadata(FSConstants.LAYOUT_VERSION,
                currentInProgressLedger.getId(), txId, -1);
        String ledgerFullPath = metadataManager.fullyQualifiedPathForLedger(ledgerMetadata);
        metadataManager.writeEditLogLedgerMetadata(ledgerFullPath, ledgerMetadata);
        maxTxId.store(txId);
        currentInProgressMetadata.update(ledgerFullPath);

        // Used by recoverUnfinalizedSegments() to skip the segment we write
        currentInProgressPath = ledgerFullPath;

        BookKeeperEditLogOutputStream out = new BookKeeperEditLogOutputStream(currentInProgressLedger, zkParentPath,
                metrics);
        out.create(); // Write the ledger header and flush it to BookKeeper

        InjectionHandler.processEvent(InjectionEvent.BKJM_STARTLOGSEGMENT, ledgerMetadata);
        return out;
    }

    /**
     * Finalizes the in-progress segment starting at firstTxId: closes the
     * current ledger if it backs that segment, writes finalized metadata
     * (with lastTxId set) to ZooKeeper, advances maxTxId, clears the
     * current-in-progress pointer if it refers to this segment, and deletes
     * the in-progress metadata ZNode.
     * @param firstTxId first transaction id of the segment to finalize
     * @param lastTxId last transaction id contained in the segment
     * @throws IOException if no matching in-progress segment exists, if the
     *                     finalized metadata conflicts with an existing
     *                     ZNode, or on BookKeeper/ZooKeeper errors
     */
    @Override
    public void finalizeLogSegment(long firstTxId, long lastTxId) throws IOException {
        checkEnv();

        try {
            // First, find an in-progress ledger starting at firstTxId
            Versioned<EditLogLedgerMetadata> inProgressMetaAndVersion = metadataManager
                    .findInProgressLedger(firstTxId);

            if (inProgressMetaAndVersion == null) {
                throw new IOException(
                        "Cannot find metadata for an in-progress ledger with first txId " + firstTxId);
            }

            EditLogLedgerMetadata inProgressMeta = inProgressMetaAndVersion.getEntry();

            if (currentInProgressLedger != null) {
                long inProgressLedgerId = currentInProgressLedger.getId();

                if (inProgressMeta.getLedgerId() == inProgressLedgerId) {
                    // If the segment is the one currently in progress, close
                    // (finalize) the ledger — this ensures every entry in the
                    // ledger is committed to the BookKeeper quorum
                    try {
                        currentInProgressLedger.close();
                    } catch (BKException e) {
                        bkException("Unexpected BookKeeper error closing ledger id " + inProgressLedgerId, e);
                    } catch (InterruptedException e) {
                        interruptedException("Interrupted closing ledger id " + inProgressLedgerId, e);
                    }
                    currentInProgressPath = null;
                    currentInProgressLedger = null;
                } else { // We can not finalize a ledger that is not in-progress
                    throw new IOException("Current in-progress ledger has ledger id (" + inProgressLedgerId
                            + ") different from expected ledger id " + inProgressMeta.getLedgerId());
                }
            }

            // Set lastTxId in the metadata and persist it to ZooKeeper
            EditLogLedgerMetadata finalizedMeta = inProgressMeta.finalizeWithLastTxId(lastTxId);
            String finalizedPath = metadataManager.fullyQualifiedPathForLedger(finalizedMeta);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Attempting to finalize metadata " + finalizedMeta + " to ZNode " + finalizedPath);
            }
            // If the write failed because the ZNode already exists, that is
            // only acceptable when the existing data matches ours
            if (!metadataManager.writeEditLogLedgerMetadata(finalizedPath, finalizedMeta)
                    && !metadataManager.verifyEditLogLedgerMetadata(inProgressMeta, finalizedPath)) {
                throw new IOException(
                        "Node " + finalizedPath + " already exists, but data doesn't match " + finalizedMeta);
            }
            maxTxId.store(lastTxId);

            // Find the ZNode path for the metadata associated with the
            // in-progress version of the ledger
            String lastInProgressPath = metadataManager.fullyQualifiedPathForLedger(inProgressMeta);
            String inProgressPathFromCiMeta = currentInProgressMetadata.read();
            if (lastInProgressPath.equals(inProgressPathFromCiMeta)) {
                // If the ZNode path matches the ZNode path for the current
                // in-progress metadata, then clear the current in-progress
                // metadata
                currentInProgressMetadata.clear();
            }

            // Delete the in-progress metadata iff no one else has updated it
            // in the meanwhile (version check enforces this)
            if (!metadataManager.deleteLedgerMetadata(inProgressMeta, inProgressMetaAndVersion.getVersion())) {
                throw new IOException("Unable to delete in-progress znode " + lastInProgressPath
                        + " as it no longer exists (Deleted by another process?)");
            }
        } catch (IOException e) {
            LOG.error("Unable to finalized metadata for segment with firstTxId " + firstTxId + ", lastTxId "
                    + lastTxId, e);
            throw e;
        }
    }

    /**
     * An implementation of {@link LedgerHandleProvider} that fences the
     * ledger we are reading from, allowing the ledger to be recovered by
     * BookKeeper as we validate it.
     *
     * @see BookKeeperEditLogInputStream#validateEditLog(LedgerHandleProvider, EditLogLedgerMetadata)
     */
    class FencingLedgerHandleProvider implements LedgerHandleProvider {

        /**
         * Opens the ledger with recovery, fencing out any current writer.
         * @param ledgerId id of the ledger to open
         * @return handle for reading the recovered ledger
         * @throws IOException on BookKeeper errors or interruption
         */
        @Override
        public LedgerHandle openForReading(long ledgerId) throws IOException {
            try {
                LOG.info("Opening ledger id " + ledgerId + " for recovery...");

                LedgerHandle lh = bookKeeperClient.openLedger(ledgerId, BookKeeper.DigestType.MAC,
                        digestPw.getBytes());

                if (lh.getId() != ledgerId) { // Verify that correct ledger is opened
                    throw new IllegalStateException(
                            "Ledger id " + lh.getId() + " does not match requested ledger id " + ledgerId);
                }

                LOG.info("Opened ledger id " + ledgerId + " for recovery!");
                return lh;
            } catch (BKException e) {
                bkException("BookKeeper error opening ledger id " + ledgerId + " for recovery", e);
            } catch (InterruptedException e) {
                // Fixed: message previously read "...id <n>for recovery"
                // (missing space before "for")
                interruptedException("Interrupted opening ledger id " + ledgerId + " for recovery", e);
            }
            return null; // unreachable: both handlers above always throw
        }

    }

    /**
     * Validates the given ledger without fencing it and returns the last
     * transaction id it contains.
     * @see #validateAndGetEndTxId(EditLogLedgerMetadata, boolean)
     */
    @VisibleForTesting
    long validateAndGetEndTxId(EditLogLedgerMetadata ledger) throws IOException {
        return validateAndGetEndTxId(ledger, false);
    }

    /**
     * Validates the given ledger and returns the last transaction id it
     * contains, or {@link HdfsConstants#INVALID_TXID} if it holds no valid
     * transactions.
     * @param ledger metadata of the ledger to validate
     * @param fence if true, open the ledger with recovery (fencing out any
     *              writer) via {@link FencingLedgerHandleProvider}
     * @throws IOException on validation errors
     */
    long validateAndGetEndTxId(EditLogLedgerMetadata ledger, boolean fence) throws IOException {
        LedgerHandleProvider provider = fence ? new FencingLedgerHandleProvider() : this;
        FSEditLogLoader.EditLogValidation validation =
                BookKeeperEditLogInputStream.validateEditLog(provider, ledger);
        InjectionHandler.processEvent(InjectionEvent.BKJM_VALIDATELOGSEGMENT, validation);
        // A ledger with zero readable transactions is treated as corrupt
        return validation.getNumTransactions() == 0 ? HdfsConstants.INVALID_TXID : validation.getEndTxId();
    }

    /**
     * Returns metadata for all ledgers (including in-progress ones) whose
     * first transaction id is at or after fromTxId.
     * @param fromTxId lowest transaction id of interest
     * @return ledgers starting at or after fromTxId
     * @throws IOException if fromTxId falls strictly inside a finalized
     *                     ledger, or on ZooKeeper errors
     */
    private List<EditLogLedgerMetadata> getLedgers(long fromTxId) throws IOException {
        List<EditLogLedgerMetadata> selected = new ArrayList<EditLogLedgerMetadata>();
        for (EditLogLedgerMetadata candidate : metadataManager.listLedgers(true)) {
            long first = candidate.getFirstTxId();
            long last = candidate.getLastTxId();
            // A finalized ledger (last != -1) must not contain fromTxId in
            // its interior
            if (last != -1 && fromTxId > first && fromTxId <= last) {
                throw new IOException("Asked for fromTxId " + fromTxId + " which is in the middle of " + candidate);
            }
            if (first >= fromTxId) {
                selected.add(candidate);
            }
        }
        return selected;
    }

    /**
     * Scans all ledgers and updates {@link #maxSeenTxId} with the highest
     * transaction id found: the last txid for finalized ledgers, or the
     * first txid for in-progress ledgers (whose last txid is -1).
     * @return the updated maximum transaction id seen
     * @throws IOException on ZooKeeper errors while listing ledgers
     */
    private long findMaxTransaction() throws IOException {
        List<EditLogLedgerMetadata> allLedgers = getLedgers(0);
        synchronized (this) {
            for (EditLogLedgerMetadata meta : allLedgers) {
                boolean inProgress = meta.getLastTxId() == -1;
                long candidate = inProgress ? meta.getFirstTxId() : meta.getLastTxId();
                maxSeenTxId = Math.max(candidate, maxSeenTxId);
            }
        }
        return maxSeenTxId;
    }

    /**
     * For edit log segments that contain only transactions with ids earlier
     * than the earliest txid to be retained, remove the ZooKeeper-based
     * metadata and the BookKeeper ledgers associated with those segments.
     *
     * @param minTxIdToKeep the earliest txid that must be retained after
     *                      purging old logs
     * @throws IOException if there is an error talking to BookKeeper or
     *                     ZooKeeper
     */
    @Override
    public void purgeLogsOlderThan(long minTxIdToKeep) throws IOException {
        checkEnv();

        // In-progress ledgers are excluded (listLedgers(false)) and never purged
        for (EditLogLedgerMetadata candidate : metadataManager.listLedgers(false)) {
            // Keep any segment that still contains a retained transaction
            if (candidate.getFirstTxId() >= minTxIdToKeep || candidate.getLastTxId() >= minTxIdToKeep) {
                continue;
            }
            LOG.info("Purging edit log segment: " + candidate);

            // Try to delete the associated ZooKeeper metadata first
            if (!metadataManager.deleteLedgerMetadata(candidate, -1)) {
                // Another process may have already deleted the metadata
                LOG.warn(candidate + " has already been purged!");
                continue;
            }
            try {
                // Remove the ledger from BookKeeper itself to reclaim diskspace
                bookKeeperClient.deleteLedger(candidate.getLedgerId());
            } catch (BKException e) {
                bkException("Unrecoverable error deleting " + candidate + " from BookKeeper", e);
            } catch (InterruptedException e) {
                interruptedException("Interrupted deleting " + candidate + " from BookKeeper", e);
            }
        }
    }

    /**
     * Intentionally a no-op: this journal manager does not act on the
     * committed transaction id.
     */
    @Override
    public void setCommittedTxId(long txid, boolean force) {
    }

    /**
     * Recovers all un-finalized (in-progress) segments other than the one
     * this instance is currently writing: zero-length ledgers are deleted,
     * un-validatable ledgers are moved aside as corrupt, and valid ledgers
     * are fenced, validated, and finalized at their last readable
     * transaction id.
     * @throws IOException on BookKeeper or ZooKeeper errors
     */
    @Override
    synchronized public void recoverUnfinalizedSegments() throws IOException {
        checkEnv();

        Collection<EditLogLedgerMetadata> allLedgers = metadataManager.listLedgers(true);

        for (EditLogLedgerMetadata ledger : allLedgers) {
            if (ledger.getLastTxId() != -1) {
                continue; // Only un-finalized segments may be recovered
            }

            String ledgerPath = metadataManager.fullyQualifiedPathForLedger(ledger);
            if (currentInProgressPath != null && ledgerPath.equals(currentInProgressPath)) {
                // Do not recover the current in-progress segment
                continue;
            }

            // First open the ledger without fencing in order to check the length
            // of the ledger (to check for any zero-length ledgers that may have
            // been the result of a crash).
            LedgerHandle ledgerHandle = openForReading(ledger.getLedgerId());
            try {
                if (ledgerHandle.getLength() == 0) {
                    handleZeroLengthLedger(ledger); // Delete any zero-length ledgers
                    continue;
                }
            } finally {
                // Always release the non-fencing handle, even when the ledger
                // was empty and already handled above
                try {
                    ledgerHandle.close();
                } catch (BKException e) {
                    bkException("BookKeeper error closing ledger id " + ledger.getLedgerId(), e);
                } catch (InterruptedException e) {
                    interruptedException("Interrupted closing ledger id " + ledger.getLedgerId(), e);
                }
            }

            // Fence the ledger and validate it as it's being recovered by BookKeeper
            long endTxId = validateAndGetEndTxId(ledger, true);

            findMaxTransaction(); // Update maxTxId seen so far by this instance

            if (endTxId == HdfsConstants.INVALID_TXID) {
                LOG.warn(ledger + "(" + ledgerPath + ")" + " cannot be recovered!");
                metadataManager.moveAsideCorruptLedger(ledger);
                continue;
            }

            // Now finalize the ledger
            finalizeLogSegment(ledger.getFirstTxId(), endTxId);
        }
    }

    /**
     * Removes the metadata and BookKeeper ledger for an empty in-progress
     * segment (a ledger left behind when the NameNode crashed after opening
     * a segment but before writing anything to it).
     * @param ledger metadata of the empty in-progress segment
     * @throws IOException on BookKeeper or ZooKeeper errors
     */
    private void handleZeroLengthLedger(EditLogLedgerMetadata ledger) throws IOException {
        LOG.warn("In-progress edit log segment " + ledger + " refers to an "
                + "empty edit log segment. This occurs when NameNode crashes after "
                + "opening a segment, but before writing OP_START_LOG_SEGMENT. Will "
                + "delete the ledger and the metadata.");
        // Roll maxTxId back so the empty segment's txid can be reused.
        // NOTE(review): the decrement assumes maxTxId was advanced by exactly
        // one for this segment — confirm against startLogSegment()'s callers.
        if (maxTxId.get() == ledger.getFirstTxId()) {
            LOG.warn("maxTxId is set to " + ledger.getFirstTxId() + " which is "
                    + "belongs to an empty ledger. Resetting to previous maxTxId.");
            maxTxId.set(maxTxId.get() - 1);
        }
        metadataManager.deleteLedgerMetadata(ledger, -1);
        try {
            bookKeeperClient.deleteLedger(ledger.getLedgerId());
        } catch (BKException e) {
            bkException("BookKeeper error deleting empty ledger id " + ledger.getLedgerId(), e);
        } catch (InterruptedException e) {
            interruptedException("Interrupted deleting empty ledger id " + ledger.getLedgerId(), e);
        }
    }

    /**
     * Builds a manifest of finalized (and validated in-progress) edit log
     * segments that contain transactions at or after {@code fromTxId}.
     * Segments whose end txid cannot be determined are omitted; asking for a
     * txid that falls strictly inside a segment is an error.
     *
     * @param fromTxId first transaction id the caller is interested in
     * @return a sorted manifest of matching remote edit logs
     * @throws IOException if {@code fromTxId} lands in the middle of a segment
     *         or listing/validating ledgers fails
     */
    @Override
    public RemoteEditLogManifest getEditLogManifest(long fromTxId) throws IOException {
        Collection<EditLogLedgerMetadata> ledgers = metadataManager.listLedgers(true);
        LOG.info("Ledgers to include in manifest: " + ledgers);

        List<RemoteEditLog> logs = Lists.newArrayListWithCapacity(ledgers.size());

        for (EditLogLedgerMetadata ledger : ledgers) {
            long firstTxId = ledger.getFirstTxId();
            long lastTxId = ledger.getLastTxId();
            // A recorded last txid of -1 marks an in-progress segment; its real
            // end must be discovered by validation.
            boolean inProgress = (lastTxId == -1);
            if (inProgress) {
                lastTxId = validateAndGetEndTxId(ledger);
            }

            if (lastTxId == HdfsConstants.INVALID_TXID) {
                // End txid could not be established; leave out of the manifest.
                continue;
            }

            if (firstTxId >= fromTxId) {
                logs.add(new RemoteEditLog(firstTxId, lastTxId, inProgress));
            } else if (fromTxId > firstTxId && fromTxId <= lastTxId) {
                throw new IOException(
                        "Asked for firstTxId " + fromTxId + " which is in the middle of ledger " + ledger);
            }
        }

        Collections.sort(logs);
        return new RemoteEditLogManifest(logs, false);
    }

    /**
     * Shuts down the BookKeeper client. BookKeeper failures and interrupts
     * during close are translated into IOExceptions by the shared
     * {@code bkException}/{@code interruptedException} helpers.
     */
    private void closeBk() throws IOException {
        try {
            bookKeeperClient.close();
        } catch (InterruptedException interrupt) {
            interruptedException("Interrupted closing BookKeeper client ", interrupt);
        } catch (BKException bke) {
            bkException("Error closing BookKeeper client", bke);
        }
    }

    /**
     * Shuts down the ZooKeeper client. An interrupt during close is converted
     * into an IOException by the shared {@code interruptedException} helper.
     */
    private void closeZk() throws IOException {
        try {
            zk.close();
        } catch (InterruptedException interrupt) {
            interruptedException("Interrupted closing ZooKeeper client", interrupt);
        }
    }

    /**
     * Closes this journal manager: the BookKeeper client first, then the
     * ZooKeeper client. ZooKeeper is only closed if the current thread has not
     * been interrupted (e.g. by a failure while closing BookKeeper).
     */
    @Override
    public void close() throws IOException {
        try {
            closeBk();
        } finally {
            boolean interrupted = Thread.currentThread().isInterrupted();
            if (!interrupted) {
                closeZk();
            }
        }
    }

    /**
     * Selects edit log input streams covering transactions at or after
     * {@code fromTxId} and adds them to {@code streams}.
     *
     * @param streams collection that receives the selected input streams
     * @param fromTxId first transaction id of interest
     * @param inProgressOk whether in-progress (unfinalized) segments may be included
     * @param validateInProgressSegments whether in-progress segments should be
     *        validated to discover their end txid
     */
    @Override
    public void selectInputStreams(Collection<EditLogInputStream> streams, long fromTxId, boolean inProgressOk,
            boolean validateInProgressSegments) throws IOException {
        Collection<EditLogLedgerMetadata> candidates = getLedgers(fromTxId);
        if (LOG.isDebugEnabled()) {
            String mode = inProgressOk ? " (inProgress ok) " : "(excluding inProgress) ";
            LOG.debug(this + ": selecting input streams starting at " + fromTxId + mode + "from among "
                    + candidates.size() + " candidate ledger(s).");
        }
        addStreamsToCollectionFromLedgers(candidates, streams, fromTxId, inProgressOk, validateInProgressSegments);
    }

    /**
     * Filters {@code allLedgers} and adds a {@link BookKeeperEditLogInputStream}
     * to {@code streams} for each ledger that may contain transactions at or
     * after {@code fromTxId}.
     *
     * @param allLedgers candidate ledger metadata to examine
     * @param streams collection that receives the created input streams
     * @param fromTxId first transaction id of interest; ledgers that end before
     *        it are skipped
     * @param inProgressOk whether in-progress (unfinalized) segments may be included
     * @param validateInProgressSegments whether to validate in-progress segments
     *        to discover their end txid (failures skip the segment)
     */
    void addStreamsToCollectionFromLedgers(Collection<EditLogLedgerMetadata> allLedgers,
            Collection<EditLogInputStream> streams, long fromTxId, boolean inProgressOk,
            boolean validateInProgressSegments) throws IOException {
        for (EditLogLedgerMetadata ledger : allLedgers) {
            long endTxId = ledger.getLastTxId();
            if (endTxId == -1) { // -1 marks an in-progress (unfinalized) segment
                if (!inProgressOk) {
                    // BUGFIX: the continue was previously nested inside the
                    // isDebugEnabled() check, so in-progress segments were only
                    // skipped when debug logging was enabled.
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Passing over " + ledger + " because it is in progress "
                                + " and we are ignoring in-progress logs.");
                    }
                    continue;
                }
                if (validateInProgressSegments) {
                    try {
                        endTxId = validateAndGetEndTxId(ledger);
                    } catch (IOException e) {
                        LOG.error("Got an IOException while trying to validate header of " + ledger + ". Skipping.",
                                e);
                        continue;
                    }
                } else {
                    LOG.info("Skipping validation of edit segment: " + ledger);
                }
            }
            if (endTxId != HdfsConstants.INVALID_TXID && endTxId < fromTxId) {
                // Segment ends before the range the caller cares about.
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Passing over " + ledger + " because it ends at " + endTxId
                            + ", but we only care about transaction as new as " + fromTxId);
                }
                continue;
            }
            BookKeeperEditLogInputStream bkelis = new BookKeeperEditLogInputStream(this, ledger.getLedgerId(), 0,
                    ledger.getFirstTxId(), endTxId, ledger.getLastTxId() == -1);
            bkelis.setJournalManager(this);
            streams.add(bkelis);
        }
    }

    /**
     * Returns true if this journal's parent znode ({@code zkParentPath})
     * exists in ZooKeeper, i.e. some journal state has been created.
     */
    @Override
    public boolean hasSomeJournalData() throws IOException {
        return zkPathExists(zkParentPath);
    }

    /**
     * Always returns false: this journal manager stores no image data.
     */
    @Override
    public boolean hasSomeImageData() throws IOException {
        return false;
    }

    /**
     * Returns a fixed, short human-readable label for this journal.
     */
    @Override
    public String toHTMLString() {
        return "BKJM journal";
    }

    /**
     * Always returns false: this journal manager provides no image storage.
     */
    @Override
    public boolean hasImageStorage() {
        return false;
    }

    /**
     * Not yet implemented; currently returns null.
     * NOTE(review): callers must tolerate a null RemoteStorageState until this
     * TODO is addressed.
     */
    @Override
    public RemoteStorageState analyzeJournalStorage() {
        // TODO
        return null;
    }
}