cascading.platform.tez.Hadoop2TezPlatform.java Source code

Java tutorial

Introduction

Here is the source code for cascading.platform.tez.Hadoop2TezPlatform.java.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.platform.tez;

import java.io.File;
import java.io.IOException;
import java.security.Permission;
import java.util.Map;

import cascading.CascadingException;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.FlowProps;
import cascading.flow.FlowRuntimeProps;
import cascading.flow.FlowSession;
import cascading.flow.tez.Hadoop2TezFlowConnector;
import cascading.flow.tez.Hadoop2TezFlowProcess;
import cascading.flow.tez.planner.Hadoop2TezPlanner;
import cascading.platform.hadoop.BaseHadoopPlatform;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.applicationhistoryservice.ApplicationHistoryServer;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.test.MiniTezCluster;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class Hadoop2TezPlatform is automatically loaded and injected into a {@link cascading.PlatformTestCase} instance
 * so that all *PlatformTest classes can be tested against Apache Tez running on Apache Hadoop 2.x.
 * <p>
 * Depending on {@code isUseCluster()}, {@link #setUp()} either configures Tez local mode against the local
 * file system, or starts an in-process {@link MiniDFSCluster} and {@link MiniTezCluster}.
 */
public class Hadoop2TezPlatform extends BaseHadoopPlatform<TezConfiguration> {
    private static final Logger LOG = LoggerFactory.getLogger(Hadoop2TezPlatform.class);

    /** Mini HDFS cluster, only created when running in cluster mode. */
    private transient static MiniDFSCluster miniDFSCluster;
    /** Mini Tez cluster, only created when running in cluster mode. */
    private transient static MiniTezCluster miniTezCluster;
    /** Security manager installed by {@link #forbidSystemExitCall()}, or null when none is installed. */
    private transient static SecurityManager securityManager;
    /** YARN application history (timeline) server, only started in cluster mode when a timeline store is available. */
    private transient ApplicationHistoryServer yarnHistoryServer;

    /**
     * Creates the platform with a default of a single gather partition.
     */
    public Hadoop2TezPlatform() {
        this.numGatherPartitions = 1;
    }

    /**
     * @return the platform name, {@code "hadoop2-tez"}
     */
    @Override
    public String getName() {
        return "hadoop2-tez";
    }

    /**
     * Creates a Tez flow connector configured with the given properties.
     *
     * @param properties the properties handed to the connector
     * @return a new {@link Hadoop2TezFlowConnector}
     */
    @Override
    public FlowConnector getFlowConnector(Map<Object, Object> properties) {
        return new Hadoop2TezFlowConnector(properties);
    }

    /**
     * Stores the number of gather partitions under {@link FlowRuntimeProps#GATHER_PARTITIONS} as a String.
     *
     * @param properties          the properties to update
     * @param numGatherPartitions the number of gather partitions to record
     */
    @Override
    public void setNumGatherPartitionTasks(Map<Object, Object> properties, int numGatherPartitions) {
        properties.put(FlowRuntimeProps.GATHER_PARTITIONS, Integer.toString(numGatherPartitions));
    }

    /**
     * Reads the number of gather partitions from {@link FlowRuntimeProps#GATHER_PARTITIONS}.
     *
     * @param properties the properties to read
     * @return the parsed value, or {@code null} if the property is not set
     * @throws NumberFormatException if the stored value is not a parsable integer
     */
    @Override
    public Integer getNumGatherPartitionTasks(Map<Object, Object> properties) {
        Object value = properties.get(FlowRuntimeProps.GATHER_PARTITIONS);

        if (value == null)
            return null;

        return Integer.parseInt(value.toString());
    }

    /**
     * @return a new {@link TezConfiguration} copy of the current platform configuration
     */
    public TezConfiguration getConfiguration() {
        return new TezConfiguration(configuration);
    }

    /**
     * @return a new {@link Hadoop2TezFlowProcess} using {@link FlowSession#NULL} and a copy of the configuration
     */
    @Override
    public FlowProcess getFlowProcess() {
        return new Hadoop2TezFlowProcess(FlowSession.NULL, null, getConfiguration());
    }

    /**
     * @return always {@code false}, this platform is not MapReduce based
     */
    @Override
    public boolean isMapReduce() {
        return false;
    }

    /**
     * @return always {@code true}, this platform executes DAGs
     */
    @Override
    public boolean isDAG() {
        return true;
    }

    /**
     * Initializes the shared configuration and file system, either for Tez local mode or against freshly
     * started mini clusters, then applies the retry limits and global properties common to both modes.
     * <p>
     * Does nothing if the configuration has already been created.
     *
     * @throws IOException if the file system or the mini DFS cluster cannot be created
     */
    @Override
    public synchronized void setUp() throws IOException {
        if (configuration != null)
            return;

        if (!isUseCluster())
            setUpLocalMode();
        else
            setUpMiniCluster();

        // fail fast: no retries of the application, of tasks, or of failing nodes
        configuration.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, 1);
        configuration.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1);
        configuration.setInt(TezConfiguration.TEZ_AM_MAX_TASK_FAILURES_PER_NODE, 1);

        Map<Object, Object> globalProperties = getGlobalProperties();

        if (logger != null)
            globalProperties.put("log4j.logger", logger);

        FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests

        Hadoop2TezPlanner.copyProperties(configuration, globalProperties); // copy any external properties

        Hadoop2TezPlanner.copyConfiguration(properties, configuration); // put all properties on the jobconf

        ExitUtil.disableSystemExit();

        //    forbidSystemExitCall();
    }

    /**
     * Configures Tez local mode against the local file system, no clusters are started.
     *
     * @throws IOException if the local file system cannot be obtained
     */
    private void setUpLocalMode() throws IOException {
        // Current usage requirements:
        // 1. Clients need to set "tez.local.mode" to true when creating a TezClient instance. (For the examples this can be done via -Dtez.local.mode=true)
        // 2. fs.defaultFS must be set to "file:///"
        // 2.1 If running examples - this must be set in tez-site.xml (so that it's picked up by the client, as well as the conf instances used to configure the Inputs / Outputs).
        // 2.2 If using programmatically (without a tez-site.xml present). All configuration instances used (to create the client / configure Inputs / Outputs) - must have this property set.
        // 3. tez.runtime.optimize.local.fetch needs to be set to true (either via tez-site.xml or in all configurations used to create the job (similar to fs.defaultFS in step 2))
        // 4. tez.staging-dir must be set (either programmatically or via tez-site.xml).
        // Until TEZ-1337 goes in - the staging-dir for the job is effectively the root of the filesystem (and where inputs are read from / written to if relative paths are used).

        LOG.info("not using cluster");
        configuration = new Configuration();

        configuration.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions());
        //      configuration.setInt( FlowRuntimeProps.GATHER_PARTITIONS, 1 ); // deadlocks if larger than 1

        configuration.set(TezConfiguration.TEZ_LOCAL_MODE, "true");
        configuration.set("fs.defaultFS", "file:///");
        configuration.set("tez.runtime.optimize.local.fetch", "true");

        // hack to prevent deadlocks where downstream processors are scheduled before upstream
        configuration.setInt("tez.am.inline.task.execution.max-tasks", 3); // testHashJoinMergeIntoHashJoinAccumulatedAccumulatedMerge fails if set to 2

        configuration.set(TezConfiguration.TEZ_IGNORE_LIB_URIS, "true"); // in local mode, use local classpath
        configuration.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1);
        configuration.set(TezConfiguration.TEZ_GENERATE_DEBUG_ARTIFACTS, "true");

        configuration.set("tez.am.mode.session", "true"); // allows multiple TezClient instances to be used in a single jvm

        // honor an externally supplied hadoop.tmp.dir, otherwise fall back to the build directory
        if (!Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            configuration.set("hadoop.tmp.dir", System.getProperty("hadoop.tmp.dir"));
        else
            configuration.set("hadoop.tmp.dir", "build/test/tmp");

        fileSys = FileSystem.get(configuration);
    }

    /**
     * Starts a mini DFS cluster and a mini Tez cluster and, when a timeline store can be configured,
     * a YARN application history server. The resulting cluster configuration becomes the platform configuration.
     *
     * @throws IOException if the mini DFS cluster or its file system cannot be created
     */
    private void setUpMiniCluster() throws IOException {
        LOG.info("using cluster");

        // default the log and tmp directories into the build directory unless set externally
        if (Util.isEmpty(System.getProperty("hadoop.log.dir")))
            System.setProperty("hadoop.log.dir", "build/test/log");

        if (Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            System.setProperty("hadoop.tmp.dir", "build/test/tmp");

        new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored
        new File(System.getProperty("hadoop.tmp.dir")).mkdirs(); // ignored

        Configuration defaultConf = new Configuration();

        defaultConf.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions());

        defaultConf.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1);

        //      defaultConf.set( TezConfiguration.TEZ_AM_LOG_LEVEL, "DEBUG" );
        //      defaultConf.set( TezConfiguration.TEZ_TASK_LOG_LEVEL, "DEBUG" );

        defaultConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
        defaultConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
        defaultConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, System.getProperty("hadoop.tmp.dir"));

        miniDFSCluster = new MiniDFSCluster.Builder(defaultConf).numDataNodes(4).format(true).racks(null)
                .build();

        fileSys = miniDFSCluster.getFileSystem();

        Configuration tezConf = new Configuration(defaultConf);
        tezConf.set("fs.defaultFS", fileSys.getUri().toString()); // use HDFS
        tezConf.set(MRJobConfig.MR_AM_STAGING_DIR, "/apps_staging_dir");

        // see MiniTezClusterWithTimeline as alternate
        miniTezCluster = new MiniTezCluster(getClass().getName(), 4, 1, 1); // todo: set to 4
        miniTezCluster.init(tezConf);
        miniTezCluster.start();

        configuration = miniTezCluster.getConfig();

        // stats won't work after completion unless ATS is used
        if (setTimelineStore(configuration)) // true if ats can be loaded and configured for this hadoop version
        {
            configuration.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS,
                    ATSHistoryLoggingService.class.getName());
            configuration.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_ADDRESS, "localhost:10200");
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS, "localhost:8188");
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_HTTPS_ADDRESS, "localhost:8190");

            yarnHistoryServer = new ApplicationHistoryServer();
            yarnHistoryServer.init(configuration);
            yarnHistoryServer.start();
        }
    }

    /**
     * Configures an in-memory timeline store on the given configuration, probing for the class names used by
     * different Hadoop versions. A {@link CascadingException} raised while loading a class is treated as
     * "class not available" and triggers the next fallback.
     *
     * @param configuration the configuration to set the timeline store class on
     * @return {@code true} if a timeline store class was found and configured, {@code false} otherwise
     */
    protected boolean setTimelineStore(Configuration configuration) {
        try {
            // try hadoop 2.6
            Class<?> target = Util.loadClass("org.apache.hadoop.yarn.server.timeline.TimelineStore");
            Class<?> type = Util.loadClass("org.apache.hadoop.yarn.server.timeline.MemoryTimelineStore");

            configuration.setClass(YarnConfiguration.TIMELINE_SERVICE_STORE, type, target);

            try {
                // hadoop 2.5 has the above classes, but this one is also necessary for the timeline service with acls to function.
                Util.loadClass("org.apache.hadoop.yarn.api.records.timeline.TimelineDomain");
            } catch (CascadingException exception) {
                // no TimelineDomain available, allow Tez to run with timeline domains disabled
                configuration.setBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS, true);
            }

            return true;
        } catch (CascadingException exception) {
            try {
                // try hadoop 2.4
                Class<?> target = Util.loadClass(
                        "org.apache.hadoop.yarn.server.applicationhistoryservice.timeline.TimelineStore");
                Class<?> type = Util.loadClass(
                        "org.apache.hadoop.yarn.server.applicationhistoryservice.timeline.MemoryTimelineStore");

                configuration.setClass(YarnConfiguration.TIMELINE_SERVICE_STORE, type, target);
                configuration.setBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS, true);

                return true;
            } catch (CascadingException ignore) {
                // neither class layout is on the classpath, the timeline service cannot be configured
                return false;
            }
        }
    }

    /**
     * Thrown by the security manager installed in {@link #forbidSystemExitCall()} when a JVM exit is attempted.
     */
    private static class ExitTrappedException extends SecurityException {
    }

    /**
     * Installs a security manager that logs the calling stack and throws {@link ExitTrappedException}
     * whenever a JVM exit permission is checked. All other permission checks are allowed.
     * <p>
     * Does nothing if the security manager has already been installed by this class.
     */
    private static void forbidSystemExitCall() {
        if (securityManager != null)
            return;

        securityManager = new SecurityManager() {
            @Override
            public void checkPermission(Permission permission) {
                String name = permission.getName();

                // the JDK checks exit with a RuntimePermission named "exitVM.<status>", so an exact match
                // on "exitVM" alone would never trap the call; accept both the bare and the suffixed form
                if (name == null || !(name.equals("exitVM") || name.startsWith("exitVM.")))
                    return;

                StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();

                for (StackTraceElement stackTraceElement : stackTrace)
                    LOG.warn("exit vm trace: {}", stackTraceElement);

                throw new ExitTrappedException();
            }
        };

        System.setSecurityManager(securityManager);
    }

    /**
     * Removes the security manager installed by {@link #forbidSystemExitCall()}, re-enabling JVM exit calls.
     */
    private static void enableSystemExitCall() {
        securityManager = null;
        System.setSecurityManager(null);
    }
}