Java tutorial: com.scaleunlimited.cascading.hadoop.HadoopUtils
/**
 * Copyright 2010-2011 TransPac Software, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.scaleunlimited.cascading.hadoop;

import java.io.IOException;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobTracker.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cascading.flow.FlowProcess;
import cascading.flow.FlowProcessWrapper;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.property.AppProps;

import com.scaleunlimited.cascading.Level;
import com.scaleunlimited.cascading.LoggingFlowProcess;

public class HadoopUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(HadoopUtils.class);

    private static final long STATUS_CHECK_INTERVAL = 10000;

    public static void safeRemove(FileSystem fs, Path path) {
        if ((fs != null) && (path != null)) {
            try {
                fs.delete(path, true);
            } catch (Throwable t) {
                // Ignore
            }
        }
    }

    /**
     * Return the number of reducers, and thus the max number of parallel reduce tasks.
     *
     * @param conf
     * @return number of reducers
     * @throws IOException
     * @throws InterruptedException
     */
    public static int getNumReducers(JobConf conf) throws IOException, InterruptedException {
        // TODO the call to getMaxReduceTasks always returns 1 in MR2.
        ClusterStatus status = safeGetClusterStatus(conf);
        return status.getMaxReduceTasks();
    }

    public static int getTaskTrackers(JobConf conf) throws IOException, InterruptedException {
        ClusterStatus status = safeGetClusterStatus(conf);
        return status.getTaskTrackers();
    }

    public static JobConf getDefaultJobConf() throws IOException, InterruptedException {
        JobConf conf = new JobConf();

        // We explicitly set task counts to 1 for local so that code which depends on
        // things like the reducer count runs properly.
        if (isJobLocal(conf)) {
            conf.setNumMapTasks(1);
            conf.setNumReduceTasks(1);
        } else {
            conf.setNumReduceTasks(getNumReducers(conf));

            // TODO - By default we want to use 0.95 * the number of reduce slots, as per
            // Hadoop wiki. But we want to round, versus truncate, to avoid setting it to
            // 0 if we have one reducer. This way it only impacts you if you have more
            // than 10 reducers.
            // conf.setNumReduceTasks((getNumReducers(conf) * 95) / 100);
        }

        conf.setMapSpeculativeExecution(false);
        conf.setReduceSpeculativeExecution(false);

        return conf;
    }

    public static void setLoggingProperties(Properties props, Level cascadingLevel, Level bixoLevel) {
        props.put("log4j.logger", String.format("cascading=%s,bixo=%s", cascadingLevel, bixoLevel));
    }

    public static Map<Object, Object> getDefaultProperties(Class appJarClass, boolean debugging, JobConf conf) {
        Map<Object, Object> properties = HadoopUtil.createProperties(conf);

        // Use special Cascading hack to control logging levels for code running as Hadoop jobs
        if (debugging) {
            properties.put("log4j.logger", "cascading=DEBUG,bixo=TRACE");
        } else {
            properties.put("log4j.logger", "cascading=INFO,bixo=INFO");
        }

        AppProps.setApplicationJarClass(properties, appJarClass);

        return properties;
    }

    public static boolean isJobLocal(JobConf conf) {
        return isConfigLocal(conf);
    }

    public static boolean isConfigLocal(Configuration conf) {
        // First see if we have the new MR2 setting
        String hostname = conf.get("yarn.resourcemanager.hostname");
        if (hostname != null) {
            return hostname.equals("0.0.0.0");
        } else {
            // MR1 approach
            return conf.get("mapred.job.tracker").equalsIgnoreCase("local");
        }
    }

    @SuppressWarnings("rawtypes")
    public static boolean isHadoopFlowProcess(FlowProcess fp) {
        return (undelegate(fp) instanceof HadoopFlowProcess);
    }

    @SuppressWarnings("rawtypes")
    public static FlowProcess undelegate(FlowProcess fp) {
        FlowProcess delegate = fp;
        if (delegate instanceof LoggingFlowProcess) {
            delegate = ((LoggingFlowProcess) delegate).getDelegate();
        }

        int delegateNestingLevel = 0;
        while (delegate instanceof FlowProcessWrapper) {
            if (++delegateNestingLevel > 100) {
                throw new RuntimeException("FlowProcessWrapper seems to have circular nesting references");
            }

            delegate = FlowProcessWrapper.undelegate(delegate);
        }

        return delegate;
    }

    /**
     * Utility routine that tries to ensure the cluster is "stable" (slaves have reported in) so
     * that it's safe to call things like maxReduceTasks.
     *
     * @param conf
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @SuppressWarnings("deprecation")
    private static ClusterStatus safeGetClusterStatus(JobConf conf) throws IOException, InterruptedException {
        JobClient jobClient = new JobClient(conf);
        int numTaskTrackers = -1;

        while (true) {
            ClusterStatus status = jobClient.getClusterStatus();

            // TODO there isn't a "job tracker" in MR2, just a resource manager, and a transient
            // application manager for running a Hadoop job.
            if (status.getJobTrackerState() == State.RUNNING) {
                int curTaskTrackers = status.getTaskTrackers();
                if (curTaskTrackers == numTaskTrackers) {
                    return status;
                } else {
                    // Things are still settling down, so keep looping.
                    if (numTaskTrackers != -1) {
                        LOGGER.trace(String.format("Got incremental update to number of task trackers (%d to %d)",
                                numTaskTrackers, curTaskTrackers));
                    }

                    numTaskTrackers = curTaskTrackers;
                }
            }

            if (!isJobLocal(conf)) {
                LOGGER.trace("Sleeping during status check");
                Thread.sleep(STATUS_CHECK_INTERVAL);
            }
        }
    }
}
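To show how these utilities might be used together, here is a minimal sketch of a driver program. HadoopUtilsExample is a hypothetical class name introduced only for illustration, and the sketch assumes the com.scaleunlimited.cascading jar plus a Hadoop MR1-style cluster configuration are on the classpath; it is not part of the original class.

    import java.util.Map;

    import org.apache.hadoop.mapred.JobConf;

    import com.scaleunlimited.cascading.hadoop.HadoopUtils;

    public class HadoopUtilsExample {

        public static void main(String[] args) throws Exception {
            // Build a JobConf with the class's defaults: speculative execution off,
            // reducer count taken from the cluster, or forced to 1 when running locally.
            JobConf conf = HadoopUtils.getDefaultJobConf();

            System.out.println("Running locally? " + HadoopUtils.isJobLocal(conf));
            System.out.println("Task trackers:   " + HadoopUtils.getTaskTrackers(conf));

            // Properties suitable for constructing a Cascading flow; HadoopUtilsExample
            // stands in for whatever class lives in your application jar.
            Map<Object, Object> properties =
                    HadoopUtils.getDefaultProperties(HadoopUtilsExample.class, false, conf);
            System.out.println("log4j.logger = " + properties.get("log4j.logger"));
        }
    }

Note that getDefaultJobConf() and getTaskTrackers() both call safeGetClusterStatus(), so on a non-local cluster the program may block (polling every STATUS_CHECK_INTERVAL milliseconds) until the task trackers have reported in.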