cascading.platform.hadoop.HadoopPlatform.java Source code

Introduction

Here is the source code for cascading.platform.hadoop.HadoopPlatform.java, the test platform adapter that lets Cascading's *PlatformTest suites run against Apache Hadoop in standalone, mini cluster, or remote cluster mode.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.platform.hadoop;

import java.io.File;
import java.io.IOException;
import java.util.Map;

import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.FlowProps;
import cascading.flow.FlowSession;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.flow.hadoop.planner.HadoopPlanner;
import cascading.util.Util;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class HadoopPlatform is automatically loaded and injected into a {@link cascading.PlatformTestCase} instance
 * so that all *PlatformTest classes can be tested against Apache Hadoop.
 * <p/>
 * This platform works in three modes.
 * <p/>
 * Hadoop standalone mode is when Hadoop is NOT run as a cluster, and all
 * child tasks run in-process, in the memory of the "client" side code.
 * <p/>
 * Hadoop mini cluster mode is when a cluster is created on demand using the Hadoop MiniDFSCluster and MiniMRCluster
 * utilities. This is the default when a PlatformTestCase requests a cluster. All properties are
 * pulled from the current CLASSPATH via the JobConf.
 * <p/>
 * Lastly, remote cluster mode is enabled when the System property "mapred.jar" is set. This is a Hadoop property
 * specifying the Hadoop "job jar" to be used cluster side. It MUST contain the Cascading test suite and its
 * dependencies packaged in a Hadoop compatible way; that packaging is left to the framework using this mode.
 * Additionally, these properties may optionally be set if not already in the CLASSPATH: fs.default.name and
 * mapred.job.tracker.
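 * <p/>
 * For example, a remote cluster run might be launched with JVM arguments along these lines
 * (an illustrative example; the jar path and host names are placeholders):
 * <pre>
 * -Dmapred.jar=build/libs/cascading-platform-tests.jar
 * -Dfs.default.name=hdfs://namenode:8020
 * -Dmapred.job.tracker=jobtracker:8021
 * </pre>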
 */
public class HadoopPlatform extends BaseHadoopPlatform<JobConf> {
    private static final Logger LOG = LoggerFactory.getLogger(HadoopPlatform.class);

    public transient static MiniDFSCluster dfs;
    public transient static MiniMRCluster mr;

    public HadoopPlatform() {
    }

    @Override
    public FlowConnector getFlowConnector(Map<Object, Object> properties) {
        return new HadoopFlowConnector(properties);
    }

    @Override
    public void setNumMapTasks(Map<Object, Object> properties, int numMapTasks) {
        properties.put("mapred.map.tasks", Integer.toString(numMapTasks));
    }

    @Override
    public void setNumReduceTasks(Map<Object, Object> properties, int numReduceTasks) {
        properties.put("mapred.reduce.tasks", Integer.toString(numReduceTasks));
    }

    @Override
    public Integer getNumMapTasks(Map<Object, Object> properties) {
        if (properties.get("mapred.map.tasks") == null)
            return null;

        return Integer.parseInt(properties.get("mapred.map.tasks").toString());
    }

    @Override
    public Integer getNumReduceTasks(Map<Object, Object> properties) {
        if (properties.get("mapred.reduce.tasks") == null)
            return null;

        return Integer.parseInt(properties.get("mapred.reduce.tasks").toString());
    }

    @Override
    public JobConf getConfiguration() {
        return new JobConf(configuration);
    }

    @Override
    public FlowProcess getFlowProcess() {
        return new HadoopFlowProcess(FlowSession.NULL, getConfiguration(), true);
    }

    @Override
    public synchronized void setUp() throws IOException {
        if (configuration != null)
            return;

        if (!isUseCluster()) {
            LOG.info("not using cluster");
            configuration = new JobConf();

            // enforce the local file system in local mode
            configuration.set("fs.default.name", "file:///");
            configuration.set("mapred.job.tracker", "local");
            configuration.set("mapreduce.jobtracker.staging.root.dir",
                    System.getProperty("user.dir") + "/build/tmp/cascading/staging");

            String stagingDir = configuration.get("mapreduce.jobtracker.staging.root.dir");

            if (Util.isEmpty(stagingDir))
                configuration.set("mapreduce.jobtracker.staging.root.dir",
                        System.getProperty("user.dir") + "/build/tmp/cascading/staging");

            fileSys = FileSystem.get(configuration);
        } else {
            LOG.info("using cluster");

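            // give the mini clusters sane log and tmp locations before startup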
            if (Util.isEmpty(System.getProperty("hadoop.log.dir")))
                System.setProperty("hadoop.log.dir", "cascading-hadoop/build/test/log");

            if (Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
                System.setProperty("hadoop.tmp.dir", "cascading-hadoop/build/test/tmp");

            new File(System.getProperty("hadoop.log.dir")).mkdirs(); // return value intentionally ignored

            JobConf conf = new JobConf();

            if (!Util.isEmpty(System.getProperty("mapred.jar"))) {
                LOG.info("using a remote cluster with jar: {}", System.getProperty("mapred.jar"));
                configuration = conf;

                ((JobConf) configuration).setJar(System.getProperty("mapred.jar"));

                if (!Util.isEmpty(System.getProperty("fs.default.name"))) {
                    LOG.info("using {}={}", "fs.default.name", System.getProperty("fs.default.name"));
                    configuration.set("fs.default.name", System.getProperty("fs.default.name"));
                }

                if (!Util.isEmpty(System.getProperty("mapred.job.tracker"))) {
                    LOG.info("using {}={}", "mapred.job.tracker", System.getProperty("mapred.job.tracker"));
                    configuration.set("mapred.job.tracker", System.getProperty("mapred.job.tracker"));
                }

                configuration.set("mapreduce.user.classpath.first", "true"); // use test dependencies
                fileSys = FileSystem.get(configuration);
            } else {
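                // spin up an in-process HDFS (4 data nodes, freshly formatted) and an
                // in-process MapReduce cluster (4 task trackers) backed by that file system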
                dfs = new MiniDFSCluster(conf, 4, true, null);
                fileSys = dfs.getFileSystem();
                mr = new MiniMRCluster(4, fileSys.getUri().toString(), 1, null, null, conf);

                configuration = mr.createJobConf();
            }

            //      jobConf.set( "mapred.map.max.attempts", "1" );
            //      jobConf.set( "mapred.reduce.max.attempts", "1" );
            configuration.set("mapred.child.java.opts", "-Xmx512m");
            configuration.setInt("mapred.job.reuse.jvm.num.tasks", -1);
            configuration.setInt("jobclient.completion.poll.interval", 50);
            configuration.setInt("jobclient.progress.monitor.poll.interval", 50);
            ((JobConf) configuration).setMapSpeculativeExecution(false);
            ((JobConf) configuration).setReduceSpeculativeExecution(false);
        }

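        // platform-wide defaults; individual tests can override these through the
        // properties map via setNumMapTasks()/setNumReduceTasks()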
        ((JobConf) configuration).setNumMapTasks(numMappers);
        ((JobConf) configuration).setNumReduceTasks(numReducers);

        Map<Object, Object> globalProperties = getGlobalProperties();

        if (logger != null)
            globalProperties.put("log4j.logger", logger);

        FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests

        HadoopPlanner.copyProperties((JobConf) configuration, globalProperties); // copy any external properties

        HadoopPlanner.copyJobConf(properties, (JobConf) configuration); // mirror the jobconf entries into the properties map
    }
}
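
Usage

For orientation, the following is a minimal, hypothetical sketch of how this platform's public API fits together. cascading.PlatformTestCase normally performs this wiring automatically; the class name HadoopPlatformSketch and the task counts are illustrative assumptions, not part of the Cascading test suite.

import java.util.HashMap;
import java.util.Map;

import cascading.flow.FlowConnector;
import cascading.platform.hadoop.HadoopPlatform;

public class HadoopPlatformSketch {
    public static void main(String[] args) throws Exception {
        HadoopPlatform platform = new HadoopPlatform();

        // standalone mode unless isUseCluster() reports true; setting the
        // -Dmapred.jar system property selects remote cluster mode instead
        platform.setUp();

        // per-test properties layered on top of the platform configuration
        Map<Object, Object> properties = new HashMap<Object, Object>();
        platform.setNumMapTasks(properties, 4);
        platform.setNumReduceTasks(properties, 1);

        // the connector plans and runs flows against the configured platform
        FlowConnector connector = platform.getFlowConnector(properties);
    }
}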