org.apache.hadoop.yarn.server.sharedcachemanager.CleanerTask.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.yarn.server.sharedcachemanager.CleanerTask.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.server.sharedcachemanager;

import java.io.IOException;
import java.util.concurrent.locks.Lock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.sharedcache.SharedCacheUtil;
import org.apache.hadoop.yarn.server.sharedcachemanager.metrics.CleanerMetrics;
import org.apache.hadoop.yarn.server.sharedcachemanager.store.SCMStore;

/**
 * The task that runs and cleans up the shared cache area for stale entries and
 * orphaned files. It is expected that only one cleaner task runs at any given
 * point in time.
 */
@Private
@Evolving
class CleanerTask implements Runnable {
    private static final String RENAMED_SUFFIX = "-renamed";
    private static final Log LOG = LogFactory.getLog(CleanerTask.class);

    private final String location;
    private final long sleepTime;
    private final int nestedLevel;
    private final Path root;
    private final FileSystem fs;
    private final SCMStore store;
    private final CleanerMetrics metrics;
    private final Lock cleanerTaskLock;

    /**
     * Creates a cleaner task based on the configuration. This is provided for
     * convenience.
     *
     * @param conf
     * @param store
     * @param metrics
     * @param cleanerTaskLock lock that ensures a serial execution of cleaner
     *                        task
     * @return an instance of a CleanerTask
     */
    public static CleanerTask create(Configuration conf, SCMStore store, CleanerMetrics metrics,
            Lock cleanerTaskLock) {
        try {
            // get the root directory for the shared cache
            String location = conf.get(YarnConfiguration.SHARED_CACHE_ROOT,
                    YarnConfiguration.DEFAULT_SHARED_CACHE_ROOT);

            long sleepTime = conf.getLong(YarnConfiguration.SCM_CLEANER_RESOURCE_SLEEP_MS,
                    YarnConfiguration.DEFAULT_SCM_CLEANER_RESOURCE_SLEEP_MS);
            int nestedLevel = SharedCacheUtil.getCacheDepth(conf);
            FileSystem fs = FileSystem.get(conf);

            return new CleanerTask(location, sleepTime, nestedLevel, fs, store, metrics, cleanerTaskLock);
        } catch (IOException e) {
            LOG.error("Unable to obtain the filesystem for the cleaner service", e);
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Creates a cleaner task based on the root directory location and the
     * filesystem.
     */
    CleanerTask(String location, long sleepTime, int nestedLevel, FileSystem fs, SCMStore store,
            CleanerMetrics metrics, Lock cleanerTaskLock) {
        this.location = location;
        this.sleepTime = sleepTime;
        this.nestedLevel = nestedLevel;
        this.root = new Path(location);
        this.fs = fs;
        this.store = store;
        this.metrics = metrics;
        this.cleanerTaskLock = cleanerTaskLock;
    }

    @Override
    public void run() {
        if (!this.cleanerTaskLock.tryLock()) {
            // there is already another task running
            LOG.warn("A cleaner task is already running. " + "This scheduled cleaner task will do nothing.");
            return;
        }

        try {
            if (!fs.exists(root)) {
                LOG.error("The shared cache root " + location + " was not found. "
                        + "The cleaner task will do nothing.");
                return;
            }

            // we're now ready to process the shared cache area
            process();
        } catch (Throwable e) {
            LOG.error("Unexpected exception while initializing the cleaner task. " + "This task will do nothing,",
                    e);
        } finally {
            // this is set to false regardless of if it is a scheduled or on-demand
            // task
            this.cleanerTaskLock.unlock();
        }
    }

    /**
     * Sweeps and processes the shared cache area to clean up stale and orphaned
     * files.
     */
    void process() {
        // mark the beginning of the run in the metrics
        metrics.reportCleaningStart();
        try {
            // now traverse individual directories and process them
            // the directory structure is specified by the nested level parameter
            // (e.g. 9/c/d/<checksum>)
            String pattern = SharedCacheUtil.getCacheEntryGlobPattern(nestedLevel);
            FileStatus[] resources = fs.globStatus(new Path(root, pattern));
            int numResources = resources == null ? 0 : resources.length;
            LOG.info("Processing " + numResources + " resources in the shared cache");
            long beginMs = System.currentTimeMillis();
            if (resources != null) {
                for (FileStatus resource : resources) {
                    // check for interruption so it can abort in a timely manner in case
                    // of shutdown
                    if (Thread.currentThread().isInterrupted()) {
                        LOG.warn("The cleaner task was interrupted. Aborting.");
                        break;
                    }

                    if (resource.isDirectory()) {
                        processSingleResource(resource);
                    } else {
                        LOG.warn("Invalid file at path " + resource.getPath().toString()
                                + " when a directory was expected");
                    }
                    // add sleep time between cleaning each directory if it is non-zero
                    if (sleepTime > 0) {
                        Thread.sleep(sleepTime);
                    }
                }
            }
            long endMs = System.currentTimeMillis();
            long durationMs = endMs - beginMs;
            LOG.info("Processed " + numResources + " resource(s) in " + durationMs + " ms.");
        } catch (IOException e1) {
            LOG.error("Unable to complete the cleaner task", e1);
        } catch (InterruptedException e2) {
            Thread.currentThread().interrupt(); // restore the interrupt
        }
    }

    /**
     * Returns a path for the root directory for the shared cache.
     */
    Path getRootPath() {
        return root;
    }

    /**
     * Processes a single shared cache resource directory.
     */
    void processSingleResource(FileStatus resource) {
        Path path = resource.getPath();
        // indicates the processing status of the resource
        ResourceStatus resourceStatus = ResourceStatus.INIT;

        // first, if the path ends with the renamed suffix, it indicates the
        // directory was moved (as stale) but somehow not deleted (probably due to
        // SCM failure); delete the directory
        if (path.toString().endsWith(RENAMED_SUFFIX)) {
            LOG.info("Found a renamed directory that was left undeleted at " + path.toString() + ". Deleting.");
            try {
                if (fs.delete(path, true)) {
                    resourceStatus = ResourceStatus.DELETED;
                }
            } catch (IOException e) {
                LOG.error("Error while processing a shared cache resource: " + path, e);
            }
        } else {
            // this is the path to the cache resource directory
            // the directory name is the resource key (i.e. a unique identifier)
            String key = path.getName();

            try {
                store.cleanResourceReferences(key);
            } catch (YarnException e) {
                LOG.error("Exception thrown while removing dead appIds.", e);
            }

            if (store.isResourceEvictable(key, resource)) {
                try {
                    /*
                     * TODO See YARN-2663: There is a race condition between
                     * store.removeResource(key) and
                     * removeResourceFromCacheFileSystem(path) operations because they do
                     * not happen atomically and resources can be uploaded with different
                     * file names by the node managers.
                     */
                    // remove the resource from scm (checks for appIds as well)
                    if (store.removeResource(key)) {
                        // remove the resource from the file system
                        boolean deleted = removeResourceFromCacheFileSystem(path);
                        if (deleted) {
                            resourceStatus = ResourceStatus.DELETED;
                        } else {
                            LOG.error("Failed to remove path from the file system." + " Skipping this resource: "
                                    + path);
                            resourceStatus = ResourceStatus.ERROR;
                        }
                    } else {
                        // we did not delete the resource because it contained application
                        // ids
                        resourceStatus = ResourceStatus.PROCESSED;
                    }
                } catch (IOException e) {
                    LOG.error("Failed to remove path from the file system. Skipping this resource: " + path, e);
                    resourceStatus = ResourceStatus.ERROR;
                }
            } else {
                resourceStatus = ResourceStatus.PROCESSED;
            }
        }

        // record the processing
        switch (resourceStatus) {
        case DELETED:
            metrics.reportAFileDelete();
            break;
        case PROCESSED:
            metrics.reportAFileProcess();
            break;
        case ERROR:
            metrics.reportAFileError();
            break;
        default:
            LOG.error("Cleaner encountered an invalid status (" + resourceStatus + ") while processing resource: "
                    + path.getName());
        }
    }

    private boolean removeResourceFromCacheFileSystem(Path path) throws IOException {
        // rename the directory to make the delete atomic
        Path renamedPath = new Path(path.toString() + RENAMED_SUFFIX);
        if (fs.rename(path, renamedPath)) {
            // the directory can be removed safely now
            // log the original path
            LOG.info("Deleting " + path.toString());
            return fs.delete(renamedPath, true);
        } else {
            // we were unable to remove it for some reason: it's best to leave
            // it at that
            LOG.error("We were not able to rename the directory to " + renamedPath.toString()
                    + ". We will leave it intact.");
        }
        return false;
    }

    /**
     * A status indicating what happened with the processing of a given cache
     * resource.
     */
    private enum ResourceStatus {
        INIT,
        /** Resource was successfully processed, but not deleted **/
        PROCESSED,
        /** Resource was successfully deleted **/
        DELETED,
        /** The cleaner task ran into an error while processing the resource **/
        ERROR
    }
}