org.apache.tez.mapreduce.task.MRRuntimeTask.java Source code

Introduction

Here is the source code for org.apache.tez.mapreduce.task.MRRuntimeTask.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.task;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import javax.crypto.SecretKey;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSError;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapOutputFile;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.StringUtils;
import org.apache.tez.common.TezEngineTaskContext;
import org.apache.tez.common.TezJobConfig;
import org.apache.tez.common.TezTaskUmbilicalProtocol;
import org.apache.tez.engine.api.Input;
import org.apache.tez.engine.api.Master;
import org.apache.tez.engine.api.Output;
import org.apache.tez.engine.api.Processor;
import org.apache.tez.engine.common.security.JobTokenIdentifier;
import org.apache.tez.engine.common.security.TokenCache;
import org.apache.tez.engine.task.RuntimeTask;
import org.apache.tez.mapreduce.hadoop.DeprecatedKeys;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.hadoop.MultiStageMRConfigUtil;
import org.apache.tez.mapreduce.processor.MRTask;
import org.apache.tez.mapreduce.task.impl.YarnOutputFiles;

/**
 * {@link RuntimeTask} that hosts an {@link MRTask} processor.
 *
 * <p>During {@link #initialize(Configuration, byte[], Master)} it builds the
 * {@link JobConf} for the vertex, translates the Tez settings the MR code
 * reads into their MR equivalents and hands the result to the processor.
 * {@link #run()} delegates to the parent class and reports failures through
 * the umbilical.
 */
@SuppressWarnings("deprecation")
public class MRRuntimeTask extends RuntimeTask {

    private static final Log LOG = LogFactory.getLog(MRRuntimeTask.class);

    /** The processor viewed as an MRTask; assigned in initialize(), null before that. */
    private MRTask mrTask;

    public MRRuntimeTask(TezEngineTaskContext taskContext, Processor processor, Input[] inputs, Output[] outputs) {
        super(taskContext, processor, inputs, outputs);
    }

    /**
     * Builds the task configuration, configures the MR processor with it and
     * then initializes the processor.
     *
     * @param conf        base configuration; Tez parameters are copied from it
     *                    when a user payload is supplied
     * @param userPayload serialized vertex configuration, or {@code null} to
     *                    fall back to the job.xml resource
     * @param master      handed to the processor and later used as the
     *                    umbilical in {@link #run()}
     * @throws IOException          propagated from configuration or processor set-up
     * @throws InterruptedException propagated from configuration or processor set-up
     */
    @Override
    public void initialize(Configuration conf, byte[] userPayload, Master master)
            throws IOException, InterruptedException {

        DeprecatedKeys.init();

        final Configuration taskConf;
        if (userPayload == null) {
            // Fall back to using job.xml
            Configuration mrConf = new Configuration(conf);
            mrConf.addResource(MRJobConfig.JOB_CONF_FILE);
            taskConf = MultiStageMRConfigUtil.getConfForVertex(mrConf, taskContext.getVertexName());
        } else {
            taskConf = MRHelpers.createConfFromUserPayload(userPayload);
            copyTezConfigParameters(taskConf, conf);
        }

        // TODO Avoid all this extra config manipulation.
        // FIXME we need I/O/p level configs to be used in init below

        // TODO Post MRR
        // A single file per vertex will likely be a better solution. Does not
        // require translation - client can take care of this. Will work independent
        // of whether the configuration is for intermediate tasks or not. Has the
        // overhead of localizing multiple files per job - i.e. the client would
        // need to write these files to hdfs, add them as local resources per
        // vertex. A solution like this may be more practical once it's possible to
        // submit configuration parameters to the AM and effectively tasks via RPC.

        final JobConf job = new JobConf(taskConf);
        job.set(MRJobConfig.VERTEX_NAME, taskContext.getVertexName());

        // The processor is cast unconditionally: this task only works with an MRTask.
        this.mrTask = (MRTask) getProcessor();
        configureMRTask(job, this.mrTask);

        this.conf = job;
        this.master = master;

        // NOTE: Allow processor to initialize input/output
        processor.initialize(this.conf, this.master);
    }

    /**
     * Copies every parameter of {@code tezTaskConf} that is not already
     * present in {@code conf} into {@code conf}. Existing values in
     * {@code conf} are never overwritten.
     *
     * <p>Used when creating a conf from the userPayload: the tez config
     * parameters which are set by YarnTezDagChild need to be carried over.
     *
     * @param conf        configuration to fill in (modified in place)
     * @param tezTaskConf configuration to copy missing parameters from
     */
    public static void copyTezConfigParameters(Configuration conf, Configuration tezTaskConf) {
        Iterator<Entry<String, String>> iter = tezTaskConf.iterator();
        while (iter.hasNext()) {
            String key = iter.next().getKey();
            if (conf.get(key) == null) {
                // Deliberately read through get() rather than the iterator entry's value.
                conf.set(key, tezTaskConf.get(key));
            }
        }
    }

    /**
     * Runs the task. Any {@link Exception} is logged, followed by a
     * best-effort task cleanup, and is then reported to the umbilical as a
     * fatal error instead of being rethrown. {@link FSError} is rethrown
     * untouched.
     */
    @Override
    public void run() throws IOException, InterruptedException {
        TezTaskUmbilicalProtocol umbilical = (TezTaskUmbilicalProtocol) master;
        try {
            super.run();
        } catch (FSError e) {
            throw e;
        } catch (Exception exception) {
            // Render the trace once and reuse it for both the log and the
            // umbilical report. This avoids a second rendering through a
            // byte stream in the platform default charset, which could
            // mangle characters that charset cannot encode.
            // NOTE(review): an InterruptedException is also handled here and
            // the interrupt status is not restored - confirm that is intended.
            final String stackTrace = StringUtils.stringifyException(exception);
            LOG.warn("Exception running child : " + stackTrace);
            try {
                if (mrTask != null) {
                    mrTask.taskCleanup(umbilical);
                }
            } catch (Exception e) {
                // Cleanup is best effort; the original failure is what gets reported.
                LOG.info("Exception cleanup up: " + StringUtils.stringifyException(e));
            }
            if (taskContext.getTaskAttemptId() != null) {
                umbilical.fatalError(taskContext.getTaskAttemptId(), stackTrace);
            }
        }
    }

    /** Closes the processor, which in turn is responsible for its inputs/outputs. */
    @Override
    public void close() throws IOException, InterruptedException {
        // NOTE: Allow processor to close input/output
        processor.close();
    }

    /**
     * Prepares {@code job} for use by MR code and hands it to {@code task}:
     * attaches the current user's credentials, mirrors Tez settings under
     * their MR keys, sets the job token secret on the task (when a job token
     * is present), lets the task localize the configuration and fills in the
     * DistributedCache local paths.
     *
     * @param job  configuration to populate (modified in place)
     * @param task the MR task to configure
     */
    private static void configureMRTask(JobConf job, MRTask task) throws IOException, InterruptedException {

        Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
        job.setCredentials(credentials);
        // TODO Can this be avoided all together. Have the MRTezOutputCommitter use
        // the Tez parameter.
        // TODO This could be fetched from the env if YARN is setting it for all
        // Containers.
        // Set it in conf, so as to be able to be used by the OutputCommitter.
        job.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, job.getInt(TezJobConfig.APPLICATION_ATTEMPT_ID, -1));

        job.setClass(MRConfig.TASK_LOCAL_OUTPUT_CLASS, YarnOutputFiles.class, MapOutputFile.class); // MR

        Token<JobTokenIdentifier> jobToken = TokenCache.getJobToken(credentials);
        if (jobToken != null) {
            // Will MR ever run without a job token.
            SecretKey sk = JobTokenSecretManager.createSecretKey(jobToken.getPassword());
            task.setJobTokenSecret(sk);
        } else {
            LOG.warn("No job token set");
        }

        // NOTE(review): unlike the credentials binary below, these two values are
        // copied without a null check - confirm they are always set by the launcher.
        job.set(MRJobConfig.JOB_LOCAL_DIR, job.get(TezJobConfig.JOB_LOCAL_DIR));
        job.set(MRConfig.LOCAL_DIR, job.get(TezJobConfig.LOCAL_DIRS));
        if (job.get(TezJobConfig.DAG_CREDENTIALS_BINARY) != null) {
            job.set(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY, job.get(TezJobConfig.DAG_CREDENTIALS_BINARY));
        }

        // setup the child's attempt directories
        // Do the task-type specific localization
        task.localizeConfiguration(job);

        // Set up the DistributedCache related configs
        setupDistributedCacheConfig(job);

        task.setConf(job);
    }

    /**
     * Set up the DistributedCache related configs to make
     * {@link DistributedCache#getLocalCacheFiles(Configuration)} and
     * {@link DistributedCache#getLocalCacheArchives(Configuration)} working.
     *
     * @param job configuration to read the cache URIs from and to update
     * @throws IOException if the cache URIs cannot be read from {@code job}
     */
    private static void setupDistributedCacheConfig(final JobConf job) throws IOException {

        // All symlinks are created in the current work-dir.
        // NOTE(review): no null check on this value - confirm it is always set.
        final String localWorkDir = job.get(TezJobConfig.TASK_LOCAL_RESOURCE_DIR);

        // Update the configuration object with localized archives.
        setLocalizedCachePaths(job, MRJobConfig.CACHE_LOCALARCHIVES,
                DistributedCache.getCacheArchives(job), localWorkDir);

        // Update the configuration object with localized files.
        setLocalizedCachePaths(job, MRJobConfig.CACHE_LOCALFILES,
                DistributedCache.getCacheFiles(job), localWorkDir);
    }

    /**
     * Maps each cache URI to the path of its link under {@code localWorkDir}
     * and stores the resulting list in {@code job} under {@code confKey}. The
     * link name is the URI fragment when one is present, otherwise the last
     * component of the URI path. Nothing is written when {@code cacheUris}
     * is {@code null} or empty.
     *
     * @param job          configuration to update
     * @param confKey      key under which the localized paths are stored
     * @param cacheUris    cache URIs configured for the job, may be {@code null}
     * @param localWorkDir directory in which the links are expected to live
     */
    private static void setLocalizedCachePaths(JobConf job, String confKey, URI[] cacheUris,
            String localWorkDir) {
        if (cacheUris == null) {
            return;
        }
        List<String> localPaths = new ArrayList<String>(cacheUris.length);
        for (URI cacheUri : cacheUris) {
            String fragment = cacheUri.getFragment();
            Path name = new Path((fragment == null) ? new Path(cacheUri).getName() : fragment);
            String linkName = name.toUri().getPath();
            localPaths.add(new Path(localWorkDir, linkName).toUri().getPath());
        }
        if (!localPaths.isEmpty()) {
            job.set(confKey, StringUtils.arrayToString(localPaths.toArray(new String[localPaths.size()])));
        }
    }

}