org.apache.tez.analyzer.plugins.SlowNodeAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for the class org.apache.tez.analyzer.plugins.SlowNodeAnalyzer (file SlowNodeAnalyzer.java).

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.analyzer.plugins;

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.tez.analyzer.Analyzer;
import org.apache.tez.analyzer.CSVResult;
import org.apache.tez.common.counters.FileSystemCounter;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.common.counters.TezCounter;
import org.apache.tez.common.counters.TezCounters;
import org.apache.tez.dag.api.TezException;
import org.apache.tez.dag.api.oldrecords.TaskAttemptState;
import org.apache.tez.history.parser.datamodel.DagInfo;
import org.apache.tez.history.parser.datamodel.TaskAttemptInfo;

import java.util.Collection;
import java.util.List;

/**
 * Reports, for every node that took part in the DAG, how many task attempts ran there and how
 * they behaved on average (execution time per final state, file system bytes, GC and CPU time).
 * <p/>
 * One CSV row is produced per node, in the iteration order of the node names returned by
 * {@link DagInfo#getNodeDetails()}; this class does not sort the rows itself.
 * <p/>
 * Combine it with other counters to understand slow nodes better.
 */
public class SlowNodeAnalyzer extends TezAnalyzerBase implements Analyzer {

    private static final Log LOG = LogFactory.getLog(SlowNodeAnalyzer.class);

    /** Column names of the CSV result; {@link #analyze(DagInfo)} fills rows in exactly this order. */
    private static final String[] HEADERS = { "nodeName", "noOfTasksExecuted", "noOfKilledTasks", "noOfFailedTasks",
            "avgSucceededTaskExecutionTime", "avgKilledTaskExecutionTime", "avgFailedTaskExecutionTime",
            "avgHDFSBytesRead", "avgHDFSBytesWritten", "avgFileBytesRead", "avgFileBytesWritten", "avgGCTimeMillis",
            "avgCPUTimeMillis" };

    private final CSVResult csvResult = new CSVResult(HEADERS);

    private final Configuration config;

    /**
     * @param config configuration handed back by {@link #getConfiguration()}
     */
    public SlowNodeAnalyzer(Configuration config) {
        this.config = config;
    }

    /**
     * Adds one record per node of the given DAG to this analyzer's result.
     * <p/>
     * The record is added to the single {@link CSVResult} held by this instance, so analyzing
     * several DAGs with the same analyzer adds records for each of them to that one result.
     *
     * @param dagInfo the DAG whose per-node task attempts are summarized
     * @throws TezException declared by the {@link Analyzer} contract; not thrown directly here
     */
    @Override
    public void analyze(DagInfo dagInfo) throws TezException {
        Multimap<String, TaskAttemptInfo> nodeDetails = dagInfo.getNodeDetails();
        for (String nodeName : nodeDetails.keySet()) {
            Collection<TaskAttemptInfo> attempts = nodeDetails.get(nodeName);

            // Lazy filtered views; each one is reused for both the count and the average below.
            Iterable<TaskAttemptInfo> succeeded = getFilteredTaskAttempts(attempts, TaskAttemptState.SUCCEEDED);
            Iterable<TaskAttemptInfo> killed = getFilteredTaskAttempts(attempts, TaskAttemptState.KILLED);
            Iterable<TaskAttemptInfo> failed = getFilteredTaskAttempts(attempts, TaskAttemptState.FAILED);

            // Column order must stay in sync with HEADERS.
            List<String> record = Lists.newArrayListWithCapacity(HEADERS.length);
            record.add(nodeName);
            record.add(String.valueOf(attempts.size()));
            record.add(String.valueOf(Iterables.size(killed)));
            record.add(String.valueOf(Iterables.size(failed)));

            record.add(String.valueOf(getAvgTaskExecutionTime(succeeded)));
            record.add(String.valueOf(getAvgTaskExecutionTime(killed)));
            record.add(String.valueOf(getAvgTaskExecutionTime(failed)));

            record.add(avgCounterColumn(attempts, FileSystemCounter.HDFS_BYTES_READ));
            record.add(avgCounterColumn(attempts, FileSystemCounter.HDFS_BYTES_WRITTEN));
            record.add(avgCounterColumn(attempts, FileSystemCounter.FILE_BYTES_READ));
            record.add(avgCounterColumn(attempts, FileSystemCounter.FILE_BYTES_WRITTEN));
            record.add(avgCounterColumn(attempts, TaskCounter.GC_TIME_MILLIS));
            record.add(avgCounterColumn(attempts, TaskCounter.CPU_MILLISECONDS));

            csvResult.addRecord(record.toArray(new String[record.size()]));
        }
    }

    /**
     * Tells whether the attempt's status string names the given state, ignoring case.
     * <p/>
     * The comparison is made from the state's side so that an attempt whose status is
     * {@code null} simply does not match instead of raising a {@link NullPointerException}.
     */
    private static boolean hasStatus(TaskAttemptInfo attemptInfo, TaskAttemptState status) {
        return status.toString().equalsIgnoreCase(attemptInfo.getStatus());
    }

    /**
     * Returns a lazy view of the attempts that are in the given state.
     *
     * @param taskAttemptInfos attempts to filter
     * @param status state an attempt must be in to be part of the view
     * @return filtered view backed by {@code taskAttemptInfos}
     */
    private static Iterable<TaskAttemptInfo> getFilteredTaskAttempts(Collection<TaskAttemptInfo> taskAttemptInfos,
            final TaskAttemptState status) {
        return Iterables.filter(taskAttemptInfos, new Predicate<TaskAttemptInfo>() {
            @Override
            public boolean apply(TaskAttemptInfo input) {
                return hasStatus(input, status);
            }
        });
    }

    /**
     * Divides {@code total} by {@code count}, returning 0 when there is nothing to average.
     * <p/>
     * The division is done in {@code double} and narrowed once at the end, so large totals
     * (byte counters, summed milliseconds) are not rounded to float precision before dividing.
     */
    private static float average(long total, int count) {
        return (count > 0) ? (float) ((double) total / count) : 0f;
    }

    /**
     * Averages {@code getTimeTaken()} over the given attempts.
     *
     * @param taskAttemptInfos attempts to average over
     * @return the mean time taken, or 0 if there are no attempts
     */
    private static float getAvgTaskExecutionTime(Iterable<TaskAttemptInfo> taskAttemptInfos) {
        long totalTime = 0;
        int size = 0;
        for (TaskAttemptInfo attemptInfo : taskAttemptInfos) {
            totalTime += attemptInfo.getTimeTaken();
            size++;
        }
        return average(totalTime, size);
    }

    /**
     * Formats the average of one enum-identified counter as a CSV cell.
     * <p/>
     * The counter group is the enum's class name and the counter name is the constant's name,
     * which is how the counters are looked up by {@link #getAvgCounter}.
     */
    private static String avgCounterColumn(Collection<TaskAttemptInfo> taskAttemptInfos, Enum<?> counter) {
        return String.valueOf(
                getAvgCounter(taskAttemptInfos, counter.getDeclaringClass().getName(), counter.name()));
    }

    /**
     * Averages the value of one counter over the attempts that expose it.
     * <p/>
     * Attempts for which the counter lookup yields nothing are logged and left out of the
     * average rather than being counted as zero.
     *
     * @param taskAttemptInfos attempts to inspect
     * @param counterGroupName name of the counter group
     * @param counterName name of the counter within the group
     * @return the mean counter value, or 0 if no attempt exposes the counter
     */
    private static float getAvgCounter(Collection<TaskAttemptInfo> taskAttemptInfos, String counterGroupName,
            String counterName) {
        long total = 0;
        int taskCount = 0;
        for (TaskAttemptInfo attemptInfo : taskAttemptInfos) {
            TezCounters tezCounters = attemptInfo.getTezCounters();
            // NOTE(review): defensive guard; whether getTezCounters() can return null is not
            // visible from this file. A null is treated like a missing counter.
            TezCounter counter = (tezCounters == null) ? null
                    : tezCounters.findCounter(counterGroupName, counterName);
            if (counter != null) {
                total += counter.getValue();
                taskCount++;
            } else {
                LOG.info("Could not find counterGroupName=" + counterGroupName + ", counter=" + counterName + " in "
                        + attemptInfo);
            }
        }
        return average(total, taskCount);
    }

    /**
     * @return the CSV result that {@link #analyze(DagInfo)} adds its records to
     */
    @Override
    public CSVResult getResult() throws TezException {
        return csvResult;
    }

    /**
     * @return the display name of this analyzer
     */
    @Override
    public String getName() {
        return "Slow Node Analyzer";
    }

    /**
     * @return a multi-line, human-readable explanation of what this analyzer is for
     */
    @Override
    public String getDescription() {
        StringBuilder sb = new StringBuilder();
        sb.append("Analyze node details for the DAG.").append("\n");
        sb.append("This could be used to find out the set of nodes where the tasks are taking more "
                + "time on average and to understand whether too many tasks got scheduled on a node.").append("\n");
        sb.append("One needs to combine the task execution time with other metrics like bytes "
                + "read/written etc to get better idea of bad nodes. In order to understand the slow "
                + "nodes due to network, it might be worthwhile to consider the shuffle performance "
                + "analyzer tool in tez-tools").append("\n");
        return sb.toString();
    }

    /**
     * @return the configuration this analyzer was constructed with
     */
    @Override
    public Configuration getConfiguration() {
        return config;
    }

    /**
     * Command-line entry point: runs the analyzer through {@link ToolRunner}, prints the
     * results, and exits with the status code returned by the run.
     *
     * @param args command-line arguments passed on to {@link ToolRunner}
     * @throws Exception if the run fails
     */
    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();
        SlowNodeAnalyzer analyzer = new SlowNodeAnalyzer(config);
        int res = ToolRunner.run(config, analyzer, args);
        // printResults() is not defined in this class; presumably inherited from TezAnalyzerBase.
        analyzer.printResults();
        System.exit(res);
    }
}