org.apache.hadoop.corona.CoronaConf.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.corona.CoronaConf.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.corona;

import java.io.IOException;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Utility class for corona configuration.
 */
public class CoronaConf extends Configuration {
    /** Logger. */
    public static final Log LOG = LogFactory.getLog(CoronaConf.class);

    /** The includes file. */
    public static final String HOSTS_FILE = "cm.hosts";
    /** The excludes file. */
    public static final String EXCLUDE_HOSTS_FILE = "cm.hosts.exclude";
    /**
     * The name of the file which will contain the CM's state when it goes for
     * an upgrade.
     */
    public static final String CM_STATE_FILE = "cm.state";
    /** The RPC address of the Cluster Manager. */
    public static final String CM_ADDRESS = "cm.server.address";
    /**
     * This boolean property is used to fix whether compression would be used
     * while saving the CM state or not. While debugging, it is preferable
     * that this should be false.
     */
    public static final String CM_COMPRESS_STATE = "cm.compress.state";
    /** The HTTP UI address for the Cluster Manager. */
    public static final String CM_HTTP_ADDRESS = "cm.server.http.address";
    /** The RPC address of the Proxy Job Tracker. */
    public static final String PROXY_JOB_TRACKER_ADDRESS = "corona.proxy.job.tracker.rpcaddr";
    /** The Thrift address of the Proxy Job Tracker. */
    public static final String PROXY_JOB_TRACKER_THRIFT_ADDRESS = "corona.proxy.job.tracker.thriftaddr";
    /** The interval after which a cluster node is timed out. */
    public static final String NODE_EXPIRY_INTERVAL = "cm.node.expiryinterval";
    /**
     * Whether only configured pools are allowed; read by
     * {@link #onlyAllowConfiguredPools()}, which defaults to false.
     */
    public static final String CONFIGURED_POOLS_ONLY = "cm.configured.pools.only";
    /**
     * The number of sessions that flag node for failed connections after which
     * the node is considered bad.
     */
    public static final String NODE_MAX_FAILED_CONNECTIONS = "cm.node.max.failed.connections";
    /**
     * The number of failed connections to a node after which a session flags the
     * node as bad.
     */
    public static final String NODE_MAX_FAILED_CONNECTIONS_SESSION = "cm.node.max.failed.connections.session";
    /**
     * The number of sessions that flag node for failures after which the node is
     * considered bad.
     */
    public static final String NODE_MAX_FAILURES = "cm.node.max.failures";
    /**
     * The number of failures on a node after which a session flags the node as
     * bad.
     */
    public static final String NODE_MAX_FAILURES_SESSION = "cm.node.max.failures.session";
    /** The interval after which a session is timed out. */
    public static final String SESSION_EXPIRY_INTERVAL = "cm.session.expiryinterval";
    /**
     * Key read by {@link #getCMNotifierThreadCount()} (default 17).
     * Presumably the number of notifier threads the CM runs - TODO confirm
     * against the notifier implementation.
     */
    public static final String CM_NOTIFIER_THREAD_COUNT = "cm.notifier.numnotifiers";
    /** How often a notifier thread will poll its queue of tasks. */
    public static final String NOTIFIER_POLL_INTERVAL = "cm.notifier.pollinterval";
    /** The retry interval factor for a notifier. */
    public static final String NOTIFIER_RETRY_INTERVAL_FACTOR = "cm.notifier.retry.interval.factor";
    /** The retry interval start for a notifier. */
    public static final String NOTIFIER_RETRY_INTERVAL_START = "cm.notifier.retry.interval.start";
    /** The max retries for a notifier. */
    public static final String NOTIFIER_RETRY_MAX = "cm.notifier.retry.max";
    /** JSON configuration specifying the CPU->Resource allocation. */
    public static final String CPU_TO_RESOURCE_PARTITIONING = "cm.cpu.to.resource.partitioning";
    /**
     * Timeout key read by {@link #getCMSoTimeout()} (default 60 * 1000).
     * NOTE(review): the key name suggests a socket timeout for the CM
     * server - confirm with the code that consumes it.
     */
    public static final String CM_SOTIMEOUT = "cm.server.sotimeout";
    /** Minimum free memory on a node before scheduling on it. */
    public static final String NODE_RESERVED_MEMORY_MB = "cm.node.reserved.memory.mb";
    /** Minimum free disk on a node before scheduling on it. */
    public static final String NODE_RESERVED_DISK_GB = "cm.node.reserved.disk.gb";
    /** Log directory for sessions. */
    public static final String SESSIONS_LOG_ROOT = "corona.sessions.log.dir";
    /** Maximum number of retired sessions to keep in memory. */
    public static final String MAX_RETIRED_SESSIONS = "cm.sessions.num.retired";

    // these are left in the mapred.fairscheduler namespace to make sure they are
    // compatible with the current fairscheduler. client can be expected to send jobs
    // to corona and/or classic hadoop with same configuration
    /**
     * Key whose value names another configuration property; that property's
     * value is used as the pool when {@link #EXPLICIT_POOL_PROPERTY} is not
     * set. {@link #getPoolInfo()} falls back to "user.name" when this key is
     * unset.
     */
    public static final String IMPLICIT_POOL_PROPERTY = "mapred.fairscheduler.poolnameproperty";
    /**
     * In the format of {@code <pool group>.<pool>} (i.e. ads.nonsla).
     * Specifies a default pool group PoolGroupManager.DEFAULT_POOL_GROUP if
     * the pool group is not specified.
     * i.e. ads_nonsla -> defaultpoolgroup.ads_nonsla
     */
    public static final String EXPLICIT_POOL_PROPERTY = "mapred.fairscheduler.pool";

    /** Where the general config file is stored. */
    public static final String CONFIG_FILE_PROPERTY = "cm.config.file";

    /** Default general config file location. */
    public static final String DEFAULT_CONFIG_FILE = "corona.xml";

    /** Where the pools config file is stored. */
    public static final String POOLS_CONFIG_FILE_PROPERTY = "cm.pools.config.file";

    /**
     * Default pools config file location (same as general config file
     * by default).
     */
    public static final String DEFAULT_POOLS_CONFIG_FILE = "corona.xml";

    /**
     * Property for specifying the number of ms to wait between pools config
     * generation (if specified).
     */
    public static final String POOLS_RELOAD_PERIOD_MS_PROPERTY = "cm.pools.reload.period.ms";

    /**
     * Property for specifying the number of ms to wait between reloading
     * config files (see {@link #getConfigReloadPeriodMs()}).
     */
    public static final String CONFIG_RELOAD_PERIOD_MS_PROPERTY = "cm.config.reload.period.ms";

    /** Class to generate the pools config. */
    public static final String POOLS_CONFIG_DOCUMENT_GENERATOR_PROPERTY = "cm.pools.config.document.generator";

    /** Number of task trackers restarted in one batch. */
    public static final String CORONA_NODE_RESTART_BATCH = "corona.node.restart.batch";

    /** Interval for restarting task trackers batches. */
    public static final String CORONA_NODE_RESTART_INTERVAL = "corona.node.restart.interval";

    /** The max time CM will wait for JT heartbeat to be in sync. */
    public static final String CM_HEARTBEAT_DELAY_MAX = "cm.heartbeat.delay.max";

    /**
     * Result of the first {@link #getCpuToResourcePartitioning()} call on this
     * object; null until that call has completed.
     */
    private Map<Integer, Map<ResourceType, Integer>> cachedCpuToResourcePartitioning = null;

    /**
     * Create a corona configuration from an existing configuration by handing
     * it to the {@link Configuration} superclass constructor.
     *
     * @param conf Configuration passed to the superclass constructor
     */
    public CoronaConf(Configuration conf) {
        super(conf);
    }

    /**
     * Get the timeout stored under {@link #CM_SOTIMEOUT}.
     *
     * @return The configured timeout, or 60 * 1000 if not set
     */
    public int getCMSoTimeout() {
        final int defaultSoTimeout = 60 * 1000;
        return getInt(CM_SOTIMEOUT, defaultSoTimeout);
    }

    /**
     * Get the RPC address of the Cluster Manager from this configuration.
     *
     * @return The value of {@link #CM_ADDRESS}, or "localhost:8888" if not set
     */
    public String getClusterManagerAddress() {
        // Same key and default as the static overload, applied to this object.
        return getClusterManagerAddress(this);
    }

    /**
     * Get the HTTP UI address for the Cluster Manager.
     *
     * @return The value of {@link #CM_HTTP_ADDRESS}, or "localhost:0" if not
     *         set
     */
    public String getClusterManagerHttpAddress() {
        final String fallbackAddress = "localhost:0";
        return get(CM_HTTP_ADDRESS, fallbackAddress);
    }

    /**
     * Get the RPC address of the Proxy Job Tracker.
     *
     * @return The value of {@link #PROXY_JOB_TRACKER_ADDRESS}, or
     *         "localhost:50035" if not set
     */
    public String getProxyJobTrackerAddress() {
        final String fallbackAddress = "localhost:50035";
        return get(PROXY_JOB_TRACKER_ADDRESS, fallbackAddress);
    }

    /**
     * Get the Thrift address of the Proxy Job Tracker.
     *
     * @return The value of {@link #PROXY_JOB_TRACKER_THRIFT_ADDRESS}, or
     *         "localhost:50036" if not set
     */
    public String getProxyJobTrackerThriftAddress() {
        final String fallbackAddress = "localhost:50036";
        return get(PROXY_JOB_TRACKER_THRIFT_ADDRESS, fallbackAddress);
    }

    /**
     * Get the RPC address of the Cluster Manager from an arbitrary
     * configuration.
     *
     * @param conf Configuration to read {@link #CM_ADDRESS} from
     * @return The configured address, or "localhost:8888" if not set
     */
    public static String getClusterManagerAddress(Configuration conf) {
        final String fallbackAddress = "localhost:8888";
        return conf.get(CM_ADDRESS, fallbackAddress);
    }

    /**
     * Get the interval after which a cluster node is timed out.
     *
     * @return The value of {@link #NODE_EXPIRY_INTERVAL}, or 10 * 60 * 1000 if
     *         not set
     */
    public int getNodeExpiryInterval() {
        final int defaultExpiry = 10 * 60 * 1000;
        return getInt(NODE_EXPIRY_INTERVAL, defaultExpiry);
    }

    /**
     * Get the log directory for sessions.
     *
     * @return The value of {@link #SESSIONS_LOG_ROOT}, or "/tmp/history" if not
     *         set
     */
    public String getSessionsLogDir() {
        final String fallbackDir = "/tmp/history";
        return get(SESSIONS_LOG_ROOT, fallbackDir);
    }

    /**
     * Get the maximum number of retired sessions to keep in memory.
     *
     * @return The value of {@link #MAX_RETIRED_SESSIONS}, or 1000 if not set
     */
    public int getNumRetiredSessions() {
        final int defaultRetired = 1000;
        return getInt(MAX_RETIRED_SESSIONS, defaultRetired);
    }

    /**
     * Get the value configured under "corona.history.max.per.dir".
     *
     * @return The configured value, or 1000 if not set
     */
    public int getMaxSessionsPerDir() {
        final String key = "corona.history.max.per.dir";
        return getInt(key, 1000);
    }

    /**
     * Get the value configured under "corona.history.roll.period".
     *
     * @return The configured value, or 60L * 60 * 1000 if not set
     */
    public long getLogDirRotationInterval() {
        final String key = "corona.history.roll.period";
        final long defaultRollPeriod = 60L * 60 * 1000;
        return getLong(key, defaultRollPeriod);
    }

    /**
     * Get the interval after which a session is timed out.
     *
     * <p>A non-zero value of {@link #SESSION_EXPIRY_INTERVAL} is returned as
     * is. Otherwise the interval is derived from the notifier retry settings:
     * starting from {@link #getNotifierRetryIntervalStart()}, the value is
     * grown by {@code value += value * factor} once for every retry after the
     * first, up to {@link #getNotifierRetryMax()}.
     *
     * <p>The derived value is accumulated in a {@code long} and clamped to the
     * {@code int} range, so very large retry settings saturate instead of
     * silently wrapping around.
     *
     * @return The configured or derived session expiry interval
     */
    public int getSessionExpiryInterval() {
        int configured = getInt(SESSION_EXPIRY_INTERVAL, 0);
        if (configured != 0) {
            return configured;
        }

        // if the session expiry interval is not specified then we compute
        // one based on the exponential backoff intervals of the session
        // notification retries
        long interval = getNotifierRetryIntervalStart();
        final int factor = getNotifierRetryIntervalFactor();
        // Read the retry limit once instead of on every loop iteration.
        final int retryMax = getNotifierRetryMax();
        for (int i = 1; i < retryMax; i++) {
            // interval is within int range here, so this cannot overflow a long.
            interval += interval * factor;
            if (interval > Integer.MAX_VALUE) {
                return Integer.MAX_VALUE;
            }
            if (interval < Integer.MIN_VALUE) {
                return Integer.MIN_VALUE;
            }
        }
        return (int) interval;
    }

    /**
     * Get how often a notifier thread will poll its queue of tasks.
     *
     * @return The value of {@link #NOTIFIER_POLL_INTERVAL}, or 1000 if not set
     */
    public int getNotifierPollInterval() {
        final int defaultPollInterval = 1000;
        return getInt(NOTIFIER_POLL_INTERVAL, defaultPollInterval);
    }

    /**
     * Get the retry interval factor for a notifier.
     *
     * @return The value of {@link #NOTIFIER_RETRY_INTERVAL_FACTOR}, or 4 if not
     *         set
     */
    public int getNotifierRetryIntervalFactor() {
        final int defaultFactor = 4;
        return getInt(NOTIFIER_RETRY_INTERVAL_FACTOR, defaultFactor);
    }

    /**
     * Get the retry interval start for a notifier.
     *
     * @return The value of {@link #NOTIFIER_RETRY_INTERVAL_START}, or 5000 if
     *         not set
     */
    public int getNotifierRetryIntervalStart() {
        final int defaultStart = 5000;
        return getInt(NOTIFIER_RETRY_INTERVAL_START, defaultStart);
    }

    /**
     * Get the max retries for a notifier.
     *
     * @return The value of {@link #NOTIFIER_RETRY_MAX}, or 5 if not set
     */
    public int getNotifierRetryMax() {
        final int defaultMaxRetries = 5;
        return getInt(NOTIFIER_RETRY_MAX, defaultMaxRetries);
    }

    /**
     * Return the cpu to resource partitioning of this object, computing it via
     * {@link #getUncachedCpuToResourcePartitioning(Configuration)} on the
     * first call and reusing the stored result afterwards.
     *
     * @return Mapping of cpu to resources (cached)
     */
    public Map<Integer, Map<ResourceType, Integer>> getCpuToResourcePartitioning() {
        if (cachedCpuToResourcePartitioning != null) {
            return cachedCpuToResourcePartitioning;
        }
        cachedCpuToResourcePartitioning = getUncachedCpuToResourcePartitioning(this);
        return cachedCpuToResourcePartitioning;
    }

    /**
     * Determine the cpu to resource partitioning for a configuration.
     *
     * <p>The value of {@link #CPU_TO_RESOURCE_PARTITIONING} is parsed as JSON.
     * Every field name of the root node is parsed as a number of CPUs (0-64),
     * and its value must be an object whose field names are
     * {@link ResourceType} names and whose values are integers in the range
     * 0-64. A CPU entry whose object has no fields is left out of the result.
     *
     * @param conf Configuration with the cpu to resource partitioning
     * @return Mapping of cpu to resources
     * @throws RuntimeException if the value cannot be read as JSON, a CPU
     *         count is out of range, a partitioning is not an object, or a
     *         resource value is not an integer in range
     * @throws NumberFormatException if a CPU field name is not an integer
     * @throws IllegalArgumentException if a resource type name is unknown
     */
    public static Map<Integer, Map<ResourceType, Integer>> getUncachedCpuToResourcePartitioning(
            Configuration conf) {
        String jsonStr = conf.get(CPU_TO_RESOURCE_PARTITIONING, "");
        Map<Integer, Map<ResourceType, Integer>> ret = new HashMap<Integer, Map<ResourceType, Integer>>();

        // Reading the JSON is the only step that throws a checked exception.
        // JsonParseException and JsonMappingException are both IOExceptions,
        // so a single handler covers all three cases.
        JsonNode rootNode;
        try {
            ObjectMapper mapper = new ObjectMapper();
            rootNode = mapper.readValue(jsonStr, JsonNode.class);
        } catch (IOException e) {
            LOG.error(jsonStr + " is not a valid value for option: " + CPU_TO_RESOURCE_PARTITIONING, e);
            throw new RuntimeException(e);
        }

        Iterator<String> iter = rootNode.getFieldNames();
        while (iter.hasNext()) {
            String field = iter.next();
            Integer numCpu = Integer.parseInt(field);

            if ((numCpu < 0) || (numCpu > 64)) {
                throw new RuntimeException("Number of CPUs: " + numCpu + " is not in range 0-64");
            }

            JsonNode val = rootNode.get(field);
            if (!val.isObject()) {
                throw new RuntimeException("Resource Partitioning: " + val.toString() + " is not a object");
            }

            // Created lazily so that an empty object adds no entry for this CPU count.
            Map<ResourceType, Integer> resourcePartition = null;

            Iterator<String> valIter = val.getFieldNames();
            while (valIter.hasNext()) {
                String resourceTypeString = valIter.next();
                JsonNode resourceVal = val.get(resourceTypeString);

                // getIntValue() is only consulted once the node is known to be an int.
                int resourceSlots = resourceVal.isInt() ? resourceVal.getIntValue() : -1;
                if (resourceSlots < 0 || resourceSlots > 64) {
                    throw new RuntimeException(
                            "Resource Partition value: " + resourceVal.toString() + " is not a valid number");
                }
                if (resourcePartition == null) {
                    resourcePartition = new EnumMap<ResourceType, Integer>(ResourceType.class);
                }

                ResourceType resourceType;
                try {
                    resourceType = ResourceType.valueOf(resourceTypeString);
                } catch (IllegalArgumentException e) {
                    // Keep the original exception as the cause for debugging.
                    throw new IllegalArgumentException(
                            "Cannot correctly parse resource type " + resourceTypeString + ", must be one of "
                                    + Arrays.toString(ResourceType.values()),
                            e);
                }
                resourcePartition.put(resourceType, Integer.valueOf(resourceSlots));
            }

            if (resourcePartition != null) {
                ret.put(numCpu, resourcePartition);
            }
        }

        return ret;
    }

    /**
     * Get the pool info.  In order to support previous behavior, a single pool
     * name is accepted.
     *
     * <p>The pool string is the trimmed value of
     * {@link #EXPLICIT_POOL_PROPERTY}; when that is not set, the value of the
     * property named by {@link #IMPLICIT_POOL_PROPERTY} ("user.name" when
     * unset) is used instead. A string that splits on '.' into exactly two
     * parts is treated as pool group and pool; any other non-empty string is
     * used as a pool name in the default pool group.
     *
     * @return Pool info, using a default pool group if the
     *         explicit pool can not be found
     */
    public PoolInfo getPoolInfo() {
        String implicitKey = get(IMPLICIT_POOL_PROPERTY, "user.name");
        String implicitPool = get(implicitKey, "");
        String pool = get(EXPLICIT_POOL_PROPERTY, implicitPool).trim();

        String[] parts = pool.split("[.]");
        if (parts.length == 2) {
            return new PoolInfo(parts[0], parts[1]);
        }
        if (pool.isEmpty()) {
            return PoolGroupManager.DEFAULT_POOL_INFO;
        }
        return new PoolInfo(PoolGroupManager.DEFAULT_POOL_GROUP, pool);
    }

    /**
     * Get the minimum free memory on a node before scheduling on it.
     *
     * @return The value of {@link #NODE_RESERVED_MEMORY_MB}, or 0 if not set
     */
    public int getNodeReservedMemoryMB() {
        final int defaultReserved = 0;
        return getInt(NODE_RESERVED_MEMORY_MB, defaultReserved);
    }

    /**
     * Get the minimum free disk on a node before scheduling on it.
     *
     * @return The value of {@link #NODE_RESERVED_DISK_GB}, or 0 if not set
     */
    public int getNodeReservedDiskGB() {
        final int defaultReserved = 0;
        return getInt(NODE_RESERVED_DISK_GB, defaultReserved);
    }

    /**
     * Read {@link #NODE_MAX_FAILED_CONNECTIONS}, falling back to 20.
     *
     * @return The number of sessions that report too many failed connections in
     *         order to blacklist a node.
     */
    public int getMaxFailedConnections() {
        final int defaultSessions = 20;
        return getInt(NODE_MAX_FAILED_CONNECTIONS, defaultSessions);
    }

    /**
     * Read {@link #NODE_MAX_FAILED_CONNECTIONS_SESSION}, falling back to 1.
     *
     * @return The number of failed connections to a node encountered by a session
     *         in order for it to count towards blacklisting the node.
     */
    public int getMaxFailedConnectionsPerSession() {
        final int defaultFailedConnections = 1;
        return getInt(NODE_MAX_FAILED_CONNECTIONS_SESSION, defaultFailedConnections);
    }

    /**
     * Read {@link #NODE_MAX_FAILURES}, falling back to 40.
     *
     * @return The number of sessions that report too many failures in order to
     *         blacklist a node.
     */
    public int getMaxFailures() {
        final int defaultSessions = 40;
        return getInt(NODE_MAX_FAILURES, defaultSessions);
    }

    /**
     * Read {@link #NODE_MAX_FAILURES_SESSION}, falling back to 5.
     *
     * @return The number of failures encountered by a session in order for it to
     *         count towards blacklisting the node.
     */
    public int getMaxFailuresPerSession() {
        final int defaultFailures = 5;
        return getInt(NODE_MAX_FAILURES_SESSION, defaultFailures);
    }

    /**
     * Get the includes file.
     *
     * @return The value of {@link #HOSTS_FILE}, or an empty string if not set
     */
    public String getHostsFile() {
        final String notConfigured = "";
        return get(HOSTS_FILE, notConfigured);
    }

    /**
     * Get the excludes file.
     *
     * @return The value of {@link #EXCLUDE_HOSTS_FILE}, or an empty string if
     *         not set
     */
    public String getExcludesFile() {
        final String notConfigured = "";
        return get(EXCLUDE_HOSTS_FILE, notConfigured);
    }

    /**
     * Get the location of the file that holds the ClusterManager state while
     * it is down for an upgrade.
     *
     * @return A String, containing the value of {@link #CM_STATE_FILE}, or
     *          "cm.state" if not set.
     */
    public String getCMStateFile() {
        final String fallbackStateFile = "cm.state";
        return get(CM_STATE_FILE, fallbackStateFile);
    }

    /**
     * Tell whether compression is used while saving the ClusterManager state.
     *
     * @return A boolean, the value of {@link #CM_COMPRESS_STATE}, or false if
     *          not set.
     */
    public boolean getCMCompressStateFlag() {
        final boolean compressByDefault = false;
        return getBoolean(CM_COMPRESS_STATE, compressByDefault);
    }

    /**
     * Get the value configured under {@link #CM_NOTIFIER_THREAD_COUNT}.
     *
     * @return The configured value, or 17 if not set
     */
    public int getCMNotifierThreadCount() {
        final int defaultCount = 17;
        return getInt(CM_NOTIFIER_THREAD_COUNT, defaultCount);
    }

    /**
     * Look up where the general config file is stored.
     *
     * @return The value of {@link #CONFIG_FILE_PROPERTY}, or
     *         {@link #DEFAULT_CONFIG_FILE} if not set
     */
    public String getConfigFile() {
        String location = get(CONFIG_FILE_PROPERTY, DEFAULT_CONFIG_FILE);
        return location;
    }

    /**
     * Look up where the pools config file is stored.
     *
     * @return The value of {@link #POOLS_CONFIG_FILE_PROPERTY}, or
     *         {@link #DEFAULT_POOLS_CONFIG_FILE} if not set
     */
    public String getPoolsConfigFile() {
        String location = get(POOLS_CONFIG_FILE_PROPERTY, DEFAULT_POOLS_CONFIG_FILE);
        return location;
    }

    /**
     * Tell whether only configured pools are allowed.
     *
     * @return The value of {@link #CONFIGURED_POOLS_ONLY}: true if only
     *         configured pools are allowed, false otherwise (also when not set)
     */
    public boolean onlyAllowConfiguredPools() {
        final boolean restrictByDefault = false;
        return getBoolean(CONFIGURED_POOLS_ONLY, restrictByDefault);
    }

    /**
     * Look up how long to wait between attempts to generate the pools config.
     *
     * @return The value of {@link #POOLS_RELOAD_PERIOD_MS_PROPERTY} in
     *         milliseconds, or 5 minutes if not set
     */
    public long getPoolsReloadPeriodMs() {
        // Default of 5 minutes
        final long defaultPeriodMs = 5 * 60000;
        return getLong(POOLS_RELOAD_PERIOD_MS_PROPERTY, defaultPeriodMs);
    }

    /**
     * Look up how long to wait between reloads of the config files.
     *
     * @return The value of {@link #CONFIG_RELOAD_PERIOD_MS_PROPERTY} in
     *         milliseconds, or 1 minute if not set
     */
    public long getConfigReloadPeriodMs() {
        // Default of 1 minute
        final long defaultPeriodMs = 60000;
        return getLong(CONFIG_RELOAD_PERIOD_MS_PROPERTY, defaultPeriodMs);
    }

    /**
     * Look up the class configured under
     * {@link #POOLS_CONFIG_DOCUMENT_GENERATOR_PROPERTY}.
     *
     * @return Null if not specified, otherwise the generator class.
     */
    public Class<?> getPoolsConfigDocumentGeneratorClass() {
        Class<?> generatorClass = getClass(POOLS_CONFIG_DOCUMENT_GENERATOR_PROPERTY, null);
        return generatorClass;
    }

    /**
     * Get the number of task trackers restarted in one batch.
     *
     * @return The value of {@link #CORONA_NODE_RESTART_BATCH}, or 1000 if not
     *         set
     */
    public int getCoronaNodeRestartBatch() {
        final int defaultBatch = 1000;
        return getInt(CORONA_NODE_RESTART_BATCH, defaultBatch);
    }

    /**
     * Get the interval for restarting task tracker batches.
     *
     * @return The value of {@link #CORONA_NODE_RESTART_INTERVAL}, or 1800000
     *         if not set
     */
    public long getCoronaNodeRestartInterval() {
        final long defaultInterval = 1800000L;
        return getLong(CORONA_NODE_RESTART_INTERVAL, defaultInterval);
    }

    /**
     * Get the max time CM will wait for JT heartbeat to be in sync.
     *
     * @return The value of {@link #CM_HEARTBEAT_DELAY_MAX}, or 600000 if not
     *         set
     */
    public long getCMHeartbeatDelayMax() {
        final long defaultDelayMax = 600000;
        return getLong(CM_HEARTBEAT_DELAY_MAX, defaultDelayMax);
    }
}