com.mellanox.hadoop.mapred.MapOutputLocation.java Source code


Introduction

Here is the source code for com.mellanox.hadoop.mapred.MapOutputLocation.java. Besides the small MapOutputLocation holder class, the file contains UdaShuffleConsumerPluginShared, the shared core of Mellanox's UDA RDMA shuffle consumer plugin for Hadoop MapReduce, including its fallback path to the vanilla shuffle.
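
A shuffle consumer plugin like this is selected through the job configuration. Below is a minimal, hypothetical wiring sketch; only mapred.rdma.developer.mode actually appears in the source, while the plugin-class property name is an assumption that depends on the pluggable-shuffle patch applied to the Hadoop build:

    // Hypothetical wiring sketch -- the plugin-class property name is an
    // assumption; only "mapred.rdma.developer.mode" appears in this source file.
    JobConf conf = new JobConf();
    conf.set("mapreduce.job.reduce.shuffle.consumer.plugin.class",
             "com.mellanox.hadoop.mapred.UdaShuffleConsumerPlugin");
    // when true, doFallbackInit() below aborts the task instead of falling back to vanilla
    conf.setBoolean("mapred.rdma.developer.mode", false);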

Source

/*
** Copyright (C) 2012 Auburn University
** Copyright (C) 2012 Mellanox Technologies
** 
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at:
**  
** http://www.apache.org/licenses/LICENSE-2.0
** 
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
** either express or implied. See the License for the specific language 
** governing permissions and  limitations under the License.
**
**
*/
package com.mellanox.hadoop.mapred;

import java.io.File;
import java.io.IOException;

import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;

import java.util.Set;
import java.util.TreeSet;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; // TODO: probably concurrency is not needed 

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.ShuffleConsumerPlugin;
import org.apache.hadoop.mapred.RawKeyValueIterator;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.MapTaskCompletionEventsUpdate;
import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
import org.apache.hadoop.mapred.ReduceTask;
import org.apache.hadoop.mapred.Task;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.mapred.UdaMapredBridge;

/**
 * Abstraction to track a map-output.
 */
class MapOutputLocation {
    TaskAttemptID taskAttemptId;
    TaskID taskId;
    String ttHost;
    URL taskOutput;

    public MapOutputLocation(TaskAttemptID taskAttemptId, String ttHost, URL taskOutput) {
        this.taskAttemptId = taskAttemptId;
        this.taskId = this.taskAttemptId.getTaskID();
        this.ttHost = ttHost;
        this.taskOutput = taskOutput;
    }

    public TaskAttemptID getTaskAttemptId() {
        return taskAttemptId;
    }

    public TaskID getTaskId() {
        return taskId;
    }

    public String getHost() {
        return ttHost;
    }

    public URL getOutputLocation() {
        return taskOutput;
    }
}

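/**
 * Callbacks that hide Hadoop-version-specific shuffle APIs from the shared
 * plugin code below (note the hadoop-1.x remark in the commented-out snippet
 * that follows).
 */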
interface UdaConsumerPluginCallable {
    public boolean pluginFetchOutputs(ShuffleConsumerPlugin plugin) throws IOException;

    public RawKeyValueIterator pluginCreateKVIterator(ShuffleConsumerPlugin plugin, JobConf job, FileSystem fs,
            Reporter reporter) throws IOException, InterruptedException;

    public Class getVanillaPluginClass();

    public MapTaskCompletionEventsUpdate pluginGetMapCompletionEvents(IntWritable fromEventId, int maxEventsToFetch)
            throws IOException;
}

/*
     MapTaskCompletionEventsUpdate update = 
     umbilical.getMapCompletionEvents(reduceTask.getJobID(), 
     fromEventId.get(), 
     MAX_EVENTS_TO_FETCH,
     reduceTask.getTaskID()
     //*, reduceTask.getJvmContext() - this was for hadoop-1.x
     );
         
       
//*/

class UdaShuffleConsumerPluginShared<K, V> implements UdaConsumerPluginCallable {

    UdaConsumerPluginCallable udaConsumerPluginCallable;

    UdaShuffleConsumerPluginShared(UdaConsumerPluginCallable udaConsumerPluginCallable) {
        this.udaConsumerPluginCallable = udaConsumerPluginCallable;
    }

    public boolean pluginFetchOutputs(ShuffleConsumerPlugin plugin) throws IOException {
        return this.udaConsumerPluginCallable.pluginFetchOutputs(plugin);
    }

    public RawKeyValueIterator pluginCreateKVIterator(ShuffleConsumerPlugin plugin, JobConf job, FileSystem fs,
            Reporter reporter) throws IOException, InterruptedException {
        return this.udaConsumerPluginCallable.pluginCreateKVIterator(plugin, job, fs, reporter);
    }

    public Class getVanillaPluginClass() {
        return this.udaConsumerPluginCallable.getVanillaPluginClass();
    }

    public MapTaskCompletionEventsUpdate pluginGetMapCompletionEvents(IntWritable fromEventId, int maxEventsToFetch)
            throws IOException {
        return this.udaConsumerPluginCallable.pluginGetMapCompletionEvents(fromEventId, maxEventsToFetch);
    }

    protected ReduceTask reduceTask;
    protected TaskAttemptID reduceId;
    protected TaskUmbilicalProtocol umbilical; // Reference to the umbilical object
    protected JobConf jobConf;
    protected Reporter reporter;
    protected FileSystem fs = null;

    /*private*/ static final Log LOG = LogFactory.getLog(ShuffleConsumerPlugin.class.getCanonicalName());

    // This is the channel used to transfer the data between RDMA C++ and Hadoop
    /*private*/ UdaPluginRT rdmaChannel;

    ShuffleConsumerPlugin fallbackPlugin = null;

    // let another thread wake up fetchOutputs upon completion (either success or failure)
    private final Object fetchLock = new Object();
    private boolean fetchDone = false; // guards against a notify that arrives before fetchOutputs starts waiting

    void notifyFetchCompleted() {
        synchronized (fetchLock) {
            fetchDone = true;
            fetchLock.notify();
        }
    }

    // called outside the RT thread, usually by a UDA C++ thread
    void failureInUda(Throwable t) {

        if (LOG.isDebugEnabled())
            LOG.debug("failureInUda");

        try {
            doFallbackInit(t);

            // wake up fetchOutputs
            synchronized (fetchLock) {
                fetchDone = true;
                fetchLock.notify();
            }
        } catch (Throwable t2) {
            throw new UdaRuntimeException("Failure in UDA and failure when trying to fallback to vanilla", t2);
        }
    }

    public void init(ReduceTask reduceTask, TaskUmbilicalProtocol umbilical, JobConf conf, Reporter reporter)
            throws IOException {

        try {
            LOG.info("init - Using UdaShuffleConsumerPlugin");
            this.reduceTask = reduceTask;
            this.reduceId = reduceTask.getTaskID();

            this.umbilical = umbilical;
            this.jobConf = conf;
            this.reporter = reporter;

            this.rdmaChannel = new UdaPluginRT<K, V>(this, reduceTask, jobConf, reporter, reduceTask.getNumMaps());
        } catch (Throwable t) {
            doFallbackInit(t);
        }
    }

    public void init(ReduceTask reduceTask, TaskUmbilicalProtocol umbilical, JobConf conf, Reporter reporter,
            FileSystem fs) throws IOException {
        init(reduceTask, umbilical, conf, reporter);
        this.fs = fs;
    }

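    /**
     * Idempotently switches this reducer to the vanilla shuffle plugin,
     * unless mapred.rdma.developer.mode forbids falling back (in which case
     * the task aborts).
     *
     * @param t the failure that triggered the fallback, or null when called
     *          only as a sanity step
     */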
    synchronized /*private*/ void doFallbackInit(Throwable t) throws IOException {
        if (fallbackPlugin != null)
            return; // already done

        exitGetMapEvents = true; //sanity
        String devModeProperty = "mapred.rdma.developer.mode";
        LOG.info("checking " + devModeProperty + "...");
        if (jobConf.getBoolean(devModeProperty, false)) {
            LOG.fatal("Got UDA Fatal Error and cannot fallback to Vanilla since I am under " + devModeProperty
                    + ". Aborting...\n" + StringUtils.stringifyException(t));
            // throw( new UdaRuntimeException("Got UDA Fatal Error and cannot fallback to Vanilla since I am under " + devModeProperty, t) );
            System.exit(1);
        }

        if (t != null) {
            LOG.error(
                    "Critical failure has occurred in UdaPlugin - We'll try to use vanilla as fallbackPlugin. \n\tException is:"
                            + StringUtils.stringifyException(t));
        }

        try {
            fallbackPlugin = UdaMapredBridge.getShuffleConsumerPlugin(getVanillaPluginClass(), reduceTask,
                    umbilical, jobConf, reporter);
            LOG.info("Succesfuly switched to Using fallbackPlugin");
        } catch (ClassNotFoundException e) {
            UdaRuntimeException ure = new UdaRuntimeException(
                    "Failed to initialize UDA Shuffle and failed to fallback to vanilla Shuffle because of ClassNotFoundException",
                    e);
            ure.setStackTrace(e.getStackTrace());
            throw ure;
        }
    }

    boolean fallbackFetchOutputsDone = false;

    synchronized private boolean doFallbackFetchOutputs() throws IOException {
        if (fallbackFetchOutputsDone)
            return true; // already done

        doFallbackInit(null); // sanity
        fallbackFetchOutputsDone = pluginFetchOutputs(fallbackPlugin);
        return fallbackFetchOutputsDone;
    }

    /**
     * A flag to indicate when to exit the getMapEvents thread.
     */
    protected volatile boolean exitGetMapEvents = false;

    boolean fetchOutputsCompleted = false;

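    /**
     * Starts the map-completion-events poller, then blocks on fetchLock until
     * the RDMA side signals completion via notifyFetchCompleted(), or
     * failureInUda() wakes it to report an error.
     */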
    private boolean fetchOutputsInternal() throws IOException {
        GetMapEventsThread getMapEventsThread = null;
        // start the map events thread
        getMapEventsThread = new GetMapEventsThread();
        getMapEventsThread.start();

        LOG.info("fetchOutputs - Using UdaShuffleConsumerPlugin");
        synchronized (fetchLock) {
            // the loop guards against spurious wakeups and a notify that raced ahead of this wait
            while (!fetchDone) {
                try {
                    fetchLock.wait();
                } catch (InterruptedException e) {
                    // ignore and re-check the completion flag
                }
            }
        }
        // all done, inform the copiers to exit
        exitGetMapEvents = true;
        if (LOG.isDebugEnabled())
            LOG.debug("Fetching finished");

        if (fallbackPlugin != null) {
            LOG.warn("another thread has indicated Uda failure");
            throw new UdaRuntimeException("another thread has indicated Uda failure");
        }
        try {
            //here we only stop the polling thread, but don't close the channel,
            //because we need it to return the values later.
            getMapEventsThread.join();
            LOG.info("getMapEventsThread joined.");
        } catch (InterruptedException ie) {
            LOG.info("getMapEventsThread.join() threw an exception: "
                    + StringUtils.stringifyException(ie));
        }
        fetchOutputsCompleted = true;
        return true;
    }

    //    @Override
    public boolean fetchOutputs() throws IOException {

        try {
            if (fallbackPlugin == null) {
                return fetchOutputsInternal();
            }
        } catch (Throwable t) {
            doFallbackInit(t);
        }

        LOG.info("fetchOutputs: Using fallbackPlugin");
        return doFallbackFetchOutputs();
    }

    // playback of fetchOutputs from another code path - converts an error return into an exception, as ReduceTask does
    private void doPlaybackFetchOutputs() throws IOException {

        LOG.info("doPlaybackFetchOutputs: Using fallbackPlugin");

        // error handling code copied from ReduceTask.java
        if (!doFallbackFetchOutputs()) {

            /* - commented out till mergeThrowable is accessible - requires change in the patch                        
                     if(fallbackPlugin.mergeThrowable instanceof FSError) {
                        throw (FSError)fallbackPlugin.mergeThrowable;
                     }
                     throw new IOException("Task: " + reduceTask.getTaskID() + 
                           " - The reduce copier failed", fallbackPlugin.mergeThrowable);
            //*/
            throw new IOException("Task: " + reduceTask.getTaskID() + " - The reduce copier failed");
        }
    }

    //    @Override
    public RawKeyValueIterator createKVIterator(JobConf job, FileSystem fs, Reporter reporter) throws IOException {

        try {
            if (fetchOutputsCompleted) {
                LOG.info("createKVIterator - Using UdaShuffleConsumerPlugin");
                return this.rdmaChannel.createKVIterator_rdma(job, fs, reporter);
            }
        } catch (Throwable t) {
            doFallbackInit(t);
        }

        if (!fallbackFetchOutputsDone)
            doPlaybackFetchOutputs();//this will also playback init - if needed

        LOG.info("createKVIterator: Using fallbackPlugin");
        try {
            return pluginCreateKVIterator(fallbackPlugin, job, fs, reporter);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    private class UdaCloserThread extends Thread {
        UdaPluginRT rdmaChannel;

        public UdaCloserThread(UdaPluginRT rdmaChannel) {
            this.rdmaChannel = rdmaChannel;
            setName("UdaCloserThread");
            setDaemon(true);
        }

        @Override
        public void run() {
            LOG.info(reduceTask.getTaskID() + " Thread started: " + getName());
            if (rdmaChannel == null) {
                LOG.warn("rdmaChannel == null");
            } else {
                LOG.info("--->>> closing UdaShuffleConsumerPlugin");
                rdmaChannel.close();
                LOG.info("<<<--- UdaShuffleConsumerPlugin was closed");
            }
            LOG.info(reduceTask.getTaskID() + " Thread finished: " + getName());
        }
    }

    //    @Override
    public void close() {
        // no try/catch here: it is too late to start a new fallback to vanilla.
        if (fallbackPlugin == null) {
            LOG.info("close - Using UdaShuffleConsumerPlugin");
            this.rdmaChannel.close();
            LOG.info("====XXX Successfully closed UdaShuffleConsumerPlugin XXX====");

            return;
        }

        LOG.info("close: Using fallbackPlugin");
        fallbackPlugin.close();

        // also close UdaPlugin including C++
        UdaCloserThread udaCloserThread = new UdaCloserThread(rdmaChannel);
        udaCloserThread.start();
        try {
            udaCloserThread.join(1000); // wait up to 1 second for the udaCloserThread
        } catch (InterruptedException e) {
            LOG.info("InterruptedException on udaCloserThread.join");
        }
        LOG.info("====XXX Successfully closed fallbackPlugin XXX====");
    }

    //*   
    protected void configureClasspath(JobConf conf) throws IOException {

        // get the task and the current classloader which will become the parent
        Task task = reduceTask;
        ClassLoader parent = conf.getClassLoader();

        // get the work directory which holds the elements we are dynamically
        // adding to the classpath
        File workDir = new File(task.getJobFile()).getParentFile();
        ArrayList<URL> urllist = new ArrayList<URL>();

        // add the jars and directories to the classpath
        String jar = conf.getJar();
        if (jar != null) {
            File jobCacheDir = new File(new Path(jar).getParent().toString());

            File[] libs = new File(jobCacheDir, "lib").listFiles();
            if (libs != null) {
                for (int i = 0; i < libs.length; i++) {
                    // File.toURL() is deprecated because it does not escape
                    // special characters; toURI().toURL() is the safe form
                    urllist.add(libs[i].toURI().toURL());
                }
            }
            urllist.add(new File(jobCacheDir, "classes").toURI().toURL());
            urllist.add(jobCacheDir.toURI().toURL());

        }
        urllist.add(workDir.toURI().toURL());

        // create a new classloader with the old classloader as its parent
        // then set that classloader as the one used by the current jobconf
        URL[] urls = urllist.toArray(new URL[urllist.size()]);
        URLClassLoader loader = new URLClassLoader(urls, parent);
        conf.setClassLoader(loader);
    }
    //*/

    private class GetMapEventsThread extends Thread {

        private IntWritable fromEventId = new IntWritable(0);
        private static final long SLEEP_TIME = 1000;

        public GetMapEventsThread() {
            setName("Thread for polling Map Completion Events");
            setDaemon(true);
        }

        @Override
        public void run() {

            LOG.info(reduceTask.getTaskID() + " Thread started: " + getName());

            do {
                try {
                    int numNewMaps = getMapCompletionEvents();
                    if (numNewMaps > 0) {
                        //              synchronized (copyResultsOrNewEventsLock) {
                        //                numEventsFetched += numNewMaps;
                        //                copyResultsOrNewEventsLock.notifyAll();
                        //              }
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(reduceTask.getTaskID() + ": " + "Got " + numNewMaps + " new map-outputs");
                        }
                    }
                    Thread.sleep(SLEEP_TIME);
                } catch (InterruptedException e) {
                    LOG.warn(reduceTask.getTaskID()
                            + " GetMapEventsThread returning after an interrupted exception");
                    return;
                    //TODO: do we want fallback to vanilla??
                } catch (Throwable t) {
                    /*               
                                   String msg = reduceTask.getTaskID()
                                   + " GetMapEventsThread Ignoring exception : " 
                                   + StringUtils.stringifyException(t);
                                   pluginReportFatalError(reduceTask, reduceTask.getTaskID(), t, msg);
                    //*/
                    LOG.error("error in GetMapEventsThread");
                    failureInUda(t);
                    break;
                }
            } while (!exitGetMapEvents);

            LOG.info("GetMapEventsThread exiting");

        }

        /** Max events to fetch in one go from the tasktracker */
        private static final int MAX_EVENTS_TO_FETCH = 10000;

        /**
         * The map of (host, list of MapIds from this host) maintaining
         * map output locations. It is declared here but not referenced
         * elsewhere in this file.
         */
        private final Map<String, List<MapOutputLocation>> mapLocations = new ConcurrentHashMap<String, List<MapOutputLocation>>();

        /**
         * Tasks and attempts already seen as SUCCEEDED. These are instance
         * fields (rather than locals in getMapCompletionEvents) so that
         * duplicate-success and obsolete-after-success detection work across
         * polling rounds, not only within a single batch of events.
         */
        private final Set<TaskID> succeededTasks = new TreeSet<TaskID>();
        private final Set<TaskAttemptID> succeededAttempts = new TreeSet<TaskAttemptID>();

        /**
         * Queries the {@link TaskTracker} for a set of map-completion events
         * from a given event ID.
         * @throws IOException
         */
        private int getMapCompletionEvents() throws IOException {

            int numNewMaps = 0;
            MapTaskCompletionEventsUpdate update = pluginGetMapCompletionEvents(fromEventId, MAX_EVENTS_TO_FETCH);

            TaskCompletionEvent events[] = update.getMapTaskCompletionEvents();


            // Check if the reset is required.
            // Since there is no ordering of the task completion events at the 
            // reducer, the only option to sync with the new jobtracker is to reset 
            // the events index
            if (update.shouldReset()) {
                fromEventId.set(0);
                //          obsoleteMapIds.clear(); // clear the obsolete map
                //          mapLocations.clear(); // clear the map locations mapping

                if (succeededTasks.isEmpty()) {
                    //ignore
                    LOG.info("got reset update before we had any succeeded map - this is OK");
                } else {
                    //fallback         
                    throw new UdaRuntimeException(
                            "got reset update, after " + succeededTasks.size() + " succeeded maps");
                }
            }

            // Update the last seen event ID
            fromEventId.set(fromEventId.get() + events.length);

            // Process the TaskCompletionEvents:
            // 1. Save the SUCCEEDED maps in knownOutputs to fetch the outputs.
            // 2. Save the OBSOLETE/FAILED/KILLED maps in obsoleteOutputs to stop 
            //    fetching from those maps.
            // 3. Remove TIPFAILED maps from neededOutputs since we don't need their
            //    outputs at all.
            for (TaskCompletionEvent event : events) {
                switch (event.getTaskStatus()) {
                case SUCCEEDED: {
                    URI u = URI.create(event.getTaskTrackerHttp());
                    String host = u.getHost();
                    TaskAttemptID taskAttemptId = event.getTaskAttemptId();
                    succeededAttempts.add(taskAttemptId); // add to collection

                    TaskID coreTaskId = taskAttemptId.getTaskID();
                    if (succeededTasks.contains(coreTaskId)) {
                        //ignore
                        LOG.info("Ignoring succeeded attempt, since we already got success event"
                                + " for this task, new attempt is: '" + taskAttemptId + "'");
                    } else {
                        succeededTasks.add(coreTaskId); // add to collection
                        rdmaChannel.sendFetchReq(host, taskAttemptId.getJobID().toString(),
                                taskAttemptId.toString());
                        numNewMaps++;
                    }
                }
                    break;
                case FAILED:
                case KILLED:
                case OBSOLETE: {

                    TaskAttemptID taskAttemptId = event.getTaskAttemptId();
                    if (succeededAttempts.contains(taskAttemptId)) {
                        //fallback

                        String errorMsg = "encountered obsolete map attempt"
                                + " after this attempt was already successful. TaskStatus=" + event.getTaskStatus()
                                + " new attempt: '" + taskAttemptId + "'";

                        throw new UdaRuntimeException(errorMsg);
                    } else {
                        //ignore
                        LOG.info("Ignoring failed attempt: '" + taskAttemptId + "' with TaskStatus="
                                + event.getTaskStatus() + " that was not reported to C++ before");
                    }

                }
                    break; // reachable on the "ignore" branch above; without it execution would fall through to TIPFAILED
                case TIPFAILED: {
                    //              copiedMapOutputs.add(event.getTaskAttemptId().getTaskID());
                    LOG.info("Ignoring output of failed map TIP: '" + event.getTaskAttemptId() + "'");
                }
                    break;
                }
            }
            return numNewMaps;
        }
    }
}
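
For context, here is a hedged sketch of the call order the methods above imply. The real call sites live in the patched ReduceTask, not in this file, and the snippet assumes it is compiled in the same package (the class is package-private) with the same imports:

    // Sketch only: how a reduce task would exercise the plugin's lifecycle.
    void runShuffle(UdaShuffleConsumerPluginShared<?, ?> shared,
                    ReduceTask reduceTask, TaskUmbilicalProtocol umbilical,
                    JobConf jobConf, Reporter reporter, FileSystem fs) throws IOException {
        shared.init(reduceTask, umbilical, jobConf, reporter, fs); // opens the RDMA channel (UdaPluginRT)
        if (shared.fetchOutputs()) {   // polls map-completion events; blocks until the RDMA fetch completes
            RawKeyValueIterator it = shared.createKVIterator(jobConf, fs, reporter); // merged map outputs
            // ... feed "it" to the reduce phase ...
        }
        shared.close(); // closes the RDMA channel, or the fallback plugin if one was used
    }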