org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.java Source code


Introduction

Here is the source code for org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.java, the class Spark's external shuffle service uses to map shuffle block IDs onto physical segments of executors' local files. A brief usage sketch follows the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.network.shuffle;

import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.collect.Maps;
import org.fusesource.leveldbjni.JniDBFactory;
import org.fusesource.leveldbjni.internal.NativeDB;
import org.iq80.leveldb.DB;
import org.iq80.leveldb.DBIterator;
import org.iq80.leveldb.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.spark.network.buffer.FileSegmentManagedBuffer;
import org.apache.spark.network.buffer.ManagedBuffer;
import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo;
import org.apache.spark.network.util.JavaUtils;
import org.apache.spark.network.util.NettyUtils;
import org.apache.spark.network.util.TransportConf;

/**
 * Manages converting shuffle BlockIds into physical segments of local files, from a process outside
 * of Executors. Each Executor must register its own configuration about where it stores its files
 * (local dirs) and how (shuffle manager). The logic for retrieval of individual files is replicated
 * from Spark's FileShuffleBlockResolver and IndexShuffleBlockResolver.
 */
public class ExternalShuffleBlockResolver {
    private static final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockResolver.class);

    private static final ObjectMapper mapper = new ObjectMapper();
    /**
     * This is a common prefix for the key of each app registration we store in LevelDB, so the
     * entries are easy to find, since LevelDB lets you search by key prefix.
     */
    private static final String APP_KEY_PREFIX = "AppExecShuffleInfo";
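    // Illustrative key (hypothetical ids): the entry for app "app-123", executor "1" would be stored
    // under "AppExecShuffleInfo;{\"appId\":\"app-123\",\"execId\":\"1\"}" (see dbAppExecKey below).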
    private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0);

    // Map containing all registered executors' metadata.
    @VisibleForTesting
    final ConcurrentMap<AppExecId, ExecutorShuffleInfo> executors;

    // Single-threaded Java executor used to perform expensive recursive directory deletion.
    private final Executor directoryCleaner;

    private final TransportConf conf;

    @VisibleForTesting
    final File registeredExecutorFile;
    @VisibleForTesting
    final DB db;

    public ExternalShuffleBlockResolver(TransportConf conf, File registeredExecutorFile) throws IOException {
        this(conf, registeredExecutorFile, Executors.newSingleThreadExecutor(
                // Add the `spark` prefix because the cleaner thread runs inside the NodeManager in YARN mode.
                NettyUtils.createThreadFactory("spark-shuffle-directory-cleaner")));
    }

    // Allows tests to have more control over when directories are cleaned up.
    @VisibleForTesting
    ExternalShuffleBlockResolver(TransportConf conf, File registeredExecutorFile, Executor directoryCleaner)
            throws IOException {
        this.conf = conf;
        this.registeredExecutorFile = registeredExecutorFile;
        if (registeredExecutorFile != null) {
            Options options = new Options();
            options.createIfMissing(false);
            options.logger(new LevelDBLogger());
            DB tmpDb;
            try {
                tmpDb = JniDBFactory.factory.open(registeredExecutorFile, options);
            } catch (NativeDB.DBException e) {
                if (e.isNotFound() || e.getMessage().contains(" does not exist ")) {
                    logger.info("Creating state database at " + registeredExecutorFile);
                    options.createIfMissing(true);
                    try {
                        tmpDb = JniDBFactory.factory.open(registeredExecutorFile, options);
                    } catch (NativeDB.DBException dbExc) {
                        throw new IOException("Unable to create state store", dbExc);
                    }
                } else {
                    // The LevelDB file appears to be corrupt. Delete it and create a new one, so we
                    // can keep processing new apps.
                    logger.error("error opening leveldb file {}.  Creating new file, will not be able to "
                            + "recover state for existing applications", registeredExecutorFile, e);
                    if (registeredExecutorFile.isDirectory()) {
                        for (File f : registeredExecutorFile.listFiles()) {
                            if (!f.delete()) {
                                logger.warn("error deleting {}", f.getPath());
                            }
                        }
                    }
                    if (!registeredExecutorFile.delete()) {
                        logger.warn("error deleting {}", registeredExecutorFile.getPath());
                    }
                    options.createIfMissing(true);
                    try {
                        tmpDb = JniDBFactory.factory.open(registeredExecutorFile, options);
                    } catch (NativeDB.DBException dbExc) {
                        throw new IOException("Unable to create state store", dbExc);
                    }

                }
            }
            // if there is a version mismatch, we throw an exception, which means the service is unusable
            checkVersion(tmpDb);
            executors = reloadRegisteredExecutors(tmpDb);
            db = tmpDb;
        } else {
            db = null;
            executors = Maps.newConcurrentMap();
        }
        this.directoryCleaner = directoryCleaner;
    }

    /** Registers a new Executor with all the configuration we need to find its shuffle files. */
    public void registerExecutor(String appId, String execId, ExecutorShuffleInfo executorInfo) {
        AppExecId fullId = new AppExecId(appId, execId);
        logger.info("Registered executor {} with {}", fullId, executorInfo);
        try {
            if (db != null) {
                byte[] key = dbAppExecKey(fullId);
                byte[] value = mapper.writeValueAsString(executorInfo).getBytes(Charsets.UTF_8);
                db.put(key, value);
            }
        } catch (Exception e) {
            logger.error("Error saving registered executors", e);
        }
        executors.put(fullId, executorInfo);
    }

    /**
     * Obtains a FileSegmentManagedBuffer from a shuffle block id. We expect the blockId to have the
     * format "shuffle_ShuffleId_MapId_ReduceId" (from ShuffleBlockId), and additionally make
     * assumptions about how the hash- and sort-based shuffles store their data.
     */
    public ManagedBuffer getBlockData(String appId, String execId, String blockId) {
        String[] blockIdParts = blockId.split("_");
        if (blockIdParts.length < 4) {
            throw new IllegalArgumentException("Unexpected block id format: " + blockId);
        } else if (!blockIdParts[0].equals("shuffle")) {
            throw new IllegalArgumentException("Expected shuffle block id, got: " + blockId);
        }
        int shuffleId = Integer.parseInt(blockIdParts[1]);
        int mapId = Integer.parseInt(blockIdParts[2]);
        int reduceId = Integer.parseInt(blockIdParts[3]);
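        // For example, with hypothetical ids, blockId "shuffle_2_7_3" yields shuffleId=2, mapId=7, reduceId=3.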

        ExecutorShuffleInfo executor = executors.get(new AppExecId(appId, execId));
        if (executor == null) {
            throw new RuntimeException(
                    String.format("Executor is not registered (appId=%s, execId=%s)", appId, execId));
        }

        if ("sort".equals(executor.shuffleManager) || "tungsten-sort".equals(executor.shuffleManager)) {
            return getSortBasedShuffleBlockData(executor, shuffleId, mapId, reduceId);
        } else if ("hash".equals(executor.shuffleManager)) {
            return getHashBasedShuffleBlockData(executor, blockId);
        } else {
            throw new UnsupportedOperationException("Unsupported shuffle manager: " + executor.shuffleManager);
        }
    }

    /**
     * Removes our metadata of all executors registered for the given application, and optionally
     * also deletes the local directories associated with the executors of that application in a
     * separate thread.
     *
     * It is not valid to call registerExecutor() for an executor with this appId after invoking
     * this method.
     */
    public void applicationRemoved(String appId, boolean cleanupLocalDirs) {
        logger.info("Application {} removed, cleanupLocalDirs = {}", appId, cleanupLocalDirs);
        Iterator<Map.Entry<AppExecId, ExecutorShuffleInfo>> it = executors.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<AppExecId, ExecutorShuffleInfo> entry = it.next();
            AppExecId fullId = entry.getKey();
            final ExecutorShuffleInfo executor = entry.getValue();

            // Only touch executors associated with the appId that was removed.
            if (appId.equals(fullId.appId)) {
                it.remove();
                if (db != null) {
                    try {
                        db.delete(dbAppExecKey(fullId));
                    } catch (IOException e) {
                        logger.error("Error deleting {} from executor state db", appId, e);
                    }
                }

                if (cleanupLocalDirs) {
                    logger.info("Cleaning up executor {}'s {} local dirs", fullId, executor.localDirs.length);

                    // Execute the actual deletion in a different thread, as it may take some time.
                    directoryCleaner.execute(new Runnable() {
                        @Override
                        public void run() {
                            deleteExecutorDirs(executor.localDirs);
                        }
                    });
                }
            }
        }
    }

    /**
     * Synchronously deletes each directory one at a time.
     * Should be executed in its own thread, as this may take a long time.
     */
    private void deleteExecutorDirs(String[] dirs) {
        for (String localDir : dirs) {
            try {
                JavaUtils.deleteRecursively(new File(localDir));
                logger.debug("Successfully cleaned up directory: " + localDir);
            } catch (Exception e) {
                logger.error("Failed to delete directory: " + localDir, e);
            }
        }
    }

    /**
     * Hash-based shuffle data is simply stored as one file per block.
     * This logic is from FileShuffleBlockResolver.
     */
    private ManagedBuffer getHashBasedShuffleBlockData(ExecutorShuffleInfo executor, String blockId) {
        File shuffleFile = getFile(executor.localDirs, executor.subDirsPerLocalDir, blockId);
        return new FileSegmentManagedBuffer(conf, shuffleFile, 0, shuffleFile.length());
    }

    /**
     * Sort-based shuffle data uses an index file named "shuffle_ShuffleId_MapId_0.index" that points
     * into a data file named "shuffle_ShuffleId_MapId_0.data". This logic is from IndexShuffleBlockResolver,
     * and the block id format is from ShuffleDataBlockId and ShuffleIndexBlockId.
     */
    private ManagedBuffer getSortBasedShuffleBlockData(ExecutorShuffleInfo executor, int shuffleId, int mapId,
            int reduceId) {
        File indexFile = getFile(executor.localDirs, executor.subDirsPerLocalDir,
                "shuffle_" + shuffleId + "_" + mapId + "_0.index");

        DataInputStream in = null;
        try {
            in = new DataInputStream(new FileInputStream(indexFile));
            in.skipBytes(reduceId * 8);
            long offset = in.readLong();
            long nextOffset = in.readLong();
            return new FileSegmentManagedBuffer(conf, getFile(executor.localDirs, executor.subDirsPerLocalDir,
                    "shuffle_" + shuffleId + "_" + mapId + "_0.data"), offset, nextOffset - offset);
        } catch (IOException e) {
            throw new RuntimeException("Failed to open file: " + indexFile, e);
        } finally {
            if (in != null) {
                JavaUtils.closeQuietly(in);
            }
        }
    }

    /**
     * Hashes a filename into the corresponding local directory, in a manner consistent with
     * Spark's DiskBlockManager.getFile().
     */
    @VisibleForTesting
    static File getFile(String[] localDirs, int subDirsPerLocalDir, String filename) {
        int hash = JavaUtils.nonNegativeHash(filename);
        String localDir = localDirs[hash % localDirs.length];
        int subDirId = (hash / localDirs.length) % subDirsPerLocalDir;
        return new File(new File(localDir, String.format("%02x", subDirId)), filename);
    }
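    // A worked example with hypothetical values: if nonNegativeHash(filename) == 7 with two local
    // dirs and subDirsPerLocalDir == 64, the file lands in localDirs[7 % 2] = localDirs[1], in
    // sub-directory (7 / 2) % 64 = 3, formatted as "03".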

    void close() {
        if (db != null) {
            try {
                db.close();
            } catch (IOException e) {
                logger.error("Exception closing leveldb with registered executors", e);
            }
        }
    }

    /** Simply encodes an executor's full ID, which is appId + execId. */
    public static class AppExecId {
        public final String appId;
        public final String execId;

        @JsonCreator
        public AppExecId(@JsonProperty("appId") String appId, @JsonProperty("execId") String execId) {
            this.appId = appId;
            this.execId = execId;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            if (o == null || getClass() != o.getClass())
                return false;

            AppExecId appExecId = (AppExecId) o;
            return Objects.equal(appId, appExecId.appId) && Objects.equal(execId, appExecId.execId);
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(appId, execId);
        }

        @Override
        public String toString() {
            return Objects.toStringHelper(this).add("appId", appId).add("execId", execId).toString();
        }
    }

    private static byte[] dbAppExecKey(AppExecId appExecId) throws IOException {
        // we stick a common prefix on all the keys so we can find them in the DB
        String appExecJson = mapper.writeValueAsString(appExecId);
        String key = (APP_KEY_PREFIX + ";" + appExecJson);
        return key.getBytes(Charsets.UTF_8);
    }

    private static AppExecId parseDbAppExecKey(String s) throws IOException {
        if (!s.startsWith(APP_KEY_PREFIX)) {
            throw new IllegalArgumentException("expected a string starting with " + APP_KEY_PREFIX);
        }
        String json = s.substring(APP_KEY_PREFIX.length() + 1);
        AppExecId parsed = mapper.readValue(json, AppExecId.class);
        return parsed;
    }

    @VisibleForTesting
    static ConcurrentMap<AppExecId, ExecutorShuffleInfo> reloadRegisteredExecutors(DB db) throws IOException {
        ConcurrentMap<AppExecId, ExecutorShuffleInfo> registeredExecutors = Maps.newConcurrentMap();
        if (db != null) {
            DBIterator itr = db.iterator();
            try {
                itr.seek(APP_KEY_PREFIX.getBytes(Charsets.UTF_8));
                while (itr.hasNext()) {
                    Map.Entry<byte[], byte[]> e = itr.next();
                    String key = new String(e.getKey(), Charsets.UTF_8);
                    // Keys are iterated in sorted order, so stop at the first key without our prefix.
                    if (!key.startsWith(APP_KEY_PREFIX)) {
                        break;
                    }
                    AppExecId id = parseDbAppExecKey(key);
                    ExecutorShuffleInfo shuffleInfo = mapper.readValue(e.getValue(), ExecutorShuffleInfo.class);
                    registeredExecutors.put(id, shuffleInfo);
                }
            } finally {
                // Close the iterator so the underlying LevelDB snapshot is released.
                itr.close();
            }
        }
        return registeredExecutors;
    }

    private static class LevelDBLogger implements org.iq80.leveldb.Logger {
        private static final Logger LOG = LoggerFactory.getLogger(LevelDBLogger.class);

        @Override
        public void log(String message) {
            LOG.info(message);
        }
    }

    /**
     * Simple major.minor versioning scheme.  Any incompatible changes should be across major
     * versions.  Minor version differences are allowed -- meaning we should be able to read
     * dbs that are either earlier *or* later on the minor version.
     */
    private static void checkVersion(DB db) throws IOException {
        byte[] bytes = db.get(StoreVersion.KEY);
        if (bytes == null) {
            storeVersion(db);
        } else {
            StoreVersion version = mapper.readValue(bytes, StoreVersion.class);
            if (version.major != CURRENT_VERSION.major) {
                throw new IOException("cannot read state DB with version " + version + ", incompatible "
                        + "with current version " + CURRENT_VERSION);
            }
            storeVersion(db);
        }
    }
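    // For example, a version-1.0 service can read a database written at 1.1 (minor skew is allowed
    // in either direction), but a database written at 2.0 fails with the IOException above.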

    private static void storeVersion(DB db) throws IOException {
        db.put(StoreVersion.KEY, mapper.writeValueAsBytes(CURRENT_VERSION));
    }

    public static class StoreVersion {

        static final byte[] KEY = "StoreVersion".getBytes(Charsets.UTF_8);

        public final int major;
        public final int minor;

        @JsonCreator
        public StoreVersion(@JsonProperty("major") int major, @JsonProperty("minor") int minor) {
            this.major = major;
            this.minor = minor;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            if (o == null || getClass() != o.getClass())
                return false;

            StoreVersion that = (StoreVersion) o;

            return major == that.major && minor == that.minor;
        }

        @Override
        public int hashCode() {
            int result = major;
            result = 31 * result + minor;
            return result;
        }
    }

}
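
Example usage

The sketch below shows how the resolver might be driven end to end. It is not part of the file above: the application id, executor id, local directories, and LevelDB path are invented for illustration; the TransportConf instance is assumed to be built elsewhere, since its constructor varies across Spark versions; and ExecutorShuffleInfo is assumed to take (localDirs, subDirsPerLocalDir, shuffleManager), matching the fields the listing reads. In a real deployment these calls are issued by the shuffle service's RPC handler rather than by user code.

import java.io.File;

import org.apache.spark.network.buffer.ManagedBuffer;
import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver;
import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo;
import org.apache.spark.network.util.TransportConf;

public class ResolverUsageSketch {

    static ManagedBuffer fetchOneBlock(TransportConf conf) throws Exception {
        // Persist registrations in a LevelDB file so they survive service restarts;
        // passing null instead keeps the registry purely in memory.
        ExternalShuffleBlockResolver resolver =
                new ExternalShuffleBlockResolver(conf, new File("/tmp/registeredExecutors.ldb"));

        // Hypothetical registration: two local dirs, 64 sub-dirs per dir, sort-based shuffle.
        String[] localDirs = { "/tmp/spark-local-1", "/tmp/spark-local-2" };
        resolver.registerExecutor("app-123", "1",
                new ExecutorShuffleInfo(localDirs, 64, "sort"));

        // Block id format is shuffle_<shuffleId>_<mapId>_<reduceId>, as parsed by getBlockData().
        return resolver.getBlockData("app-123", "1", "shuffle_0_0_0");
    }
}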