com.impetus.ankush2.hadoop.monitor.JobStatusProvider.java Source code

Introduction

Here is the source code for com.impetus.ankush2.hadoop.monitor.JobStatusProvider.java
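
JobStatusProvider wraps a Hadoop JobClient bound to the cluster's JobTracker and exposes the monitoring data Ankush displays: cluster-wide job metrics (task capacities, tracker counts, scheduler type), per-job status reports, and per-task reports with counters and diagnostic messages.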

Source

/*******************************************************************************
 * ===========================================================
 * Ankush : Big Data Cluster Management Solution
 * ===========================================================
 * 
 * (C) Copyright 2014, by Impetus Technologies
 * 
 * This is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License (LGPL v3) as
 * published by the Free Software Foundation;
 * 
 * This software is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License 
 * along with this software; if not, write to the Free Software Foundation, 
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 ******************************************************************************/
/**
 * 
 */
package com.impetus.ankush2.hadoop.monitor;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import net.neoremind.sshxcute.core.Result;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.JobTracker.State;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TIPStatus;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.TaskReport;
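
// NOTE: The org.apache.hadoop.mapred imports above are the Hadoop 1.x (MRv1)
// JobTracker APIs; on YARN clusters (Hadoop 2.x+) the equivalent monitoring
// data is served by the ResourceManager through different interfaces.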

import com.impetus.ankush.common.exception.AnkushException;
import com.impetus.ankush.common.scripting.AnkushTask;
import com.impetus.ankush2.common.scripting.impl.ReadConfProperty;
import com.impetus.ankush2.constant.Constant;
import com.impetus.ankush2.framework.config.ClusterConfig;
import com.impetus.ankush2.framework.config.ComponentConfig;
import com.impetus.ankush2.framework.config.NodeConfig;
import com.impetus.ankush2.hadoop.utils.HadoopConstants;
import com.impetus.ankush2.hadoop.utils.HadoopUtils;
import com.impetus.ankush2.logger.AnkushLogger;

/**
 * @author Akhil
 * 
 */
public class JobStatusProvider {

    JobClient jobClient;

    String jobTrackerRpcPort;

    String jobTrackerHost;

    /**
     * Creates a provider bound to the given configurations and opens a
     * JobClient connection to the configured JobTracker.
     * 
     * @param clusterConfig
     *            the cluster configuration
     * @param compConfig
     *            the Hadoop component configuration
     */
    public JobStatusProvider(ClusterConfig clusterConfig, ComponentConfig compConfig) {
        this.clusterConfig = clusterConfig;
        this.compConfig = compConfig;

        // job tracker port.
        jobTrackerRpcPort = HadoopUtils.getJobTrackerRpcPort(this.compConfig);

        // job tracker host
        jobTrackerHost = HadoopUtils.getJobTrackerHost(this.compConfig);

        jobClient = getJobClient(jobTrackerHost, jobTrackerRpcPort);
    }

    /** The cluster config. */
    private ClusterConfig clusterConfig;

    /** The hadoop config. */
    private ComponentConfig compConfig;

    /** The log. */
    private AnkushLogger LOG = new AnkushLogger(JobStatusProvider.class);
    /** The Constant JOB_STATE_PREP (mirrors the run-state codes in org.apache.hadoop.mapred.JobStatus). */
    public static final int JOB_STATE_PREP = 4;

    /** The Constant JOB_STATE_RUNNING. */
    public static final int JOB_STATE_RUNNING = 1;

    /** The Constant JOB_STATE_SUCCEEDED. */
    public static final int JOB_STATE_SUCCEEDED = 2;

    /** The Constant JOB_STATE_FAILED. */
    public static final int JOB_STATE_FAILED = 3;

    /** The Constant JOB_STATE_KILLED. */
    public static final int JOB_STATE_KILLED = 5;

    /**
     * Gets the Hadoop cluster job metrics.
     * 
     * @return Map of metric names to values
     * @throws AnkushException
     *             if the JobClient connection is unavailable
     */
    public Map<String, Object> getJobMetrics() throws AnkushException {
        String errMsg = "Unable to getch Hadoop Metrics, could not connect to Hadoop JobClient.";
        try {
            // Proceed only if the JobClient connection is available
            if (jobClient != null) {
                // Creating an empty map for storing Hadoop job metrics information
                Map<String, Object> hadoopJobMetrics = new LinkedHashMap<String, Object>();
                        LOG.info("Fetching Hadoop Metrics Information.." + jobClient);
                        // Get status information about the Map-Reduce cluster.
                        ClusterStatus clusterStatus = jobClient.getClusterStatus();
                        // Get the current state of the JobTracker,
                        State jobTrackerState = clusterStatus.getJobTrackerState();
                        // Get the number of currently running map tasks in the cluster.
                        int mapTasks = clusterStatus.getMapTasks();
                        // Get the maximum capacity for running map tasks in the
                        // cluster.
                        int maxMapTasks = clusterStatus.getMaxMapTasks();
                        // Get the maximum capacity for running reduce tasks in the
                        // cluster.
                        int maxReduceTasks = clusterStatus.getMaxReduceTasks();
                        // Get the number of currently running reduce tasks in the
                        // cluster.
                        int reduceTasks = clusterStatus.getReduceTasks();
                        // Get the number of active task trackers in the cluster.
                        int taskTrackers = clusterStatus.getTaskTrackers();
                        // Get the number of blacklisted task trackers in the cluster.
                        int blackListedTrackers = clusterStatus.getBlacklistedTrackers();

                        long ttExpiryInterval = clusterStatus.getTTExpiryInterval();

                        int defaultMaps = 0;
                        int defaultReduces = 0;
                        try {
                            defaultMaps = jobClient.getDefaultMaps();
                            defaultReduces = jobClient.getDefaultReduces();
                        } catch (Exception e) {
                            // Non-fatal: keep the zero defaults and log the failure.
                            LOG.error(e.getMessage(), e);
                        }

                        // Putting Hadoop Metrics information in a map
                        hadoopJobMetrics.put("jobTrackerState", String.valueOf(jobTrackerState));
                        hadoopJobMetrics.put("defaultMaps", String.valueOf(defaultMaps));
                        hadoopJobMetrics.put("defaultReduces", String.valueOf(defaultReduces));
                        hadoopJobMetrics.put("mapTasks", String.valueOf(mapTasks));
                        hadoopJobMetrics.put("reduceTasks", String.valueOf(reduceTasks));
                        hadoopJobMetrics.put("maxMapTasksCapacity", String.valueOf(maxMapTasks));
                        hadoopJobMetrics.put("maxReduceTasksCapacity", String.valueOf(maxReduceTasks));
                        hadoopJobMetrics.put("taskTrackers", String.valueOf(taskTrackers));
                        hadoopJobMetrics.put("blackListedTrackers", String.valueOf(blackListedTrackers));

                        hadoopJobMetrics.put("taskTrackerExpiryInterval", String.valueOf(ttExpiryInterval));

                        hadoopJobMetrics.put("schedulerType", getSchedulerType());

                        int totalJobSubmission = 0;
                        // Get the jobs that are submitted.
                        JobStatus[] jobStatus = jobClient.getAllJobs();
                        if (jobStatus != null) {
                            totalJobSubmission = jobStatus.length;
                        }

                        List<Map<String, Object>> allJobsList = listAllJobs();
                        int totalJobRunning = getRunningJobList(allJobsList).size();
                        int completedJobs = getCompletedJobs(allJobsList).size();

                        hadoopJobMetrics.put("totalJobSubmission", String.valueOf(totalJobSubmission));
                        hadoopJobMetrics.put("totalJobRunning", String.valueOf(totalJobRunning));
                        hadoopJobMetrics.put("totalJobsCompleted", String.valueOf(completedJobs));
                return hadoopJobMetrics;
            } else {
                HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg,
                        Constant.Component.Name.HADOOP);
                throw new AnkushException(errMsg);
            }
        } catch (AnkushException e) {
            throw e;
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg, Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
    }

    /**
     * Gets the status report for the given job.
     * 
     * @param jobId
     *            the job id
     * @return Map<String, Object> of job status values
     */
    public Map<String, Object> getJobStatus(String jobId) throws AnkushException {
        String errMsg = "Unable to fetch Hadoop job status, could not connect to Hadoop JobClient.";
        // Creating an empty list of map for storing job Status information
        try {
            return getJobDetails(jobClient, jobId);
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg, Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
    }

    /**
     * Lists all jobs submitted to the cluster.
     * 
     * @return the list of job report maps
     * @throws AnkushException
     *             if the job list cannot be fetched
     */
    public List<Map<String, Object>> listAllJobs() throws AnkushException, InterruptedException {
        // Creating an empty list of map for storing job Status information
        String errMsg = "Unable to fetch Hadoop jobs list, could not connect to Hadoop JobClient.";
        List<Map<String, Object>> jobReports = new ArrayList<Map<String, Object>>();
        try {
            // Checking for jobClient null
            if (jobClient != null) {
                // Get the jobs that are submitted.
                JobStatus[] jobStatus = jobClient.getAllJobs();
                // Iterating over the list of all submitted jobs
                for (JobStatus jobSts : jobStatus) {
                    jobReports.add(getJobReport(jobSts));
                }
            } else {
                HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg, Constant.Component.Name.HADOOP);
                throw new AnkushException(errMsg);
            }

        } catch (AnkushException e) {
            throw e;
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg, Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
        return jobReports;
    }

    /**
     * Builds the report map for a single job.
     * 
     * @param jobSts
     *            the job status
     * @return the job report map
     * @throws IOException
     */
    private Map<String, Object> getJobReport(JobStatus jobSts) throws IOException {
        // Creating an empty map for storing job information
        Map<String, Object> jobReport = new HashMap<String, Object>();
        // Returns the jobid of the Job
        org.apache.hadoop.mapred.JobID jobId = jobSts.getJobID();
        // Get an RunningJob object to track an ongoing Map-Reduce
        // job.
        RunningJob job = jobClient.getJob(jobId);
        String jobName = "";
        if (job != null) {
            // Get the name of the job.
            jobName = job.getJobName();
        }
        // Percentage of progress in maps
        float mapProgress = jobSts.mapProgress() * 100;
        // Percentage of progress in reduce
        float reduceProgress = jobSts.reduceProgress() * 100;

        int mapTotal = 0;
        int reduceTotal = 0;
        int mapComp = 0;
        int reduceComp = 0;

        // Count for Map and Reduce Complete
        try {
            // Get the information of the current state of the map
            // tasks of a job
            TaskReport[] mapTaskReports = jobClient.getMapTaskReports(jobId);
            // Get the total map
            mapTotal = mapTaskReports.length;
            // Iterating over the map tasks
            for (TaskReport taskReport : mapTaskReports) {
                // The current state of a map TaskInProgress as seen
                // by the JobTracker.
                TIPStatus currentStatus = taskReport.getCurrentStatus();
                if (currentStatus == TIPStatus.COMPLETE) {
                    mapComp++;
                }
            }

            // Get the information of the current state of the
            // reduce tasks of a job.
            TaskReport[] reduceTaskReport = jobClient.getReduceTaskReports(jobId);
            // Get the total reduce
            reduceTotal = reduceTaskReport.length;
            // Iterating over the reduce tasks
            for (TaskReport taskReport : reduceTaskReport) {
                // The current state of a reduce TaskInProgress as
                // seen by the JobTracker.
                TIPStatus currentStatus = taskReport.getCurrentStatus();
                if (currentStatus == TIPStatus.COMPLETE) {
                    reduceComp++;
                }
            }
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
        // Percentage of progress in setup
        float setupProgress = jobSts.setupProgress() * 100;
        // The progress made on cleanup
        float cleanupProgress = jobSts.cleanupProgress() * 100;
        // gets any available info on the reason of failure of the
        // job..Returns the diagnostic information on why a job
        // might have failed.
        String failureInfo = jobSts.getFailureInfo();

        // Putting job status information in the map
        jobReport.put("jobId", jobId.toString());
        jobReport.put("jobName", jobName);
        jobReport.put("jobPriority", jobSts.getJobPriority().toString());
        jobReport.put("jobStartTime", jobSts.getStartTime());

        jobReport.put("userName", jobSts.getUsername());
        jobReport.put("jobComplete", jobSts.isJobComplete());

        jobReport.put("mapProgress", mapProgress);
        jobReport.put("reduceProgress", reduceProgress);

        jobReport.put("mapTotal", mapTotal);
        jobReport.put("reduceTotal", reduceTotal);
        jobReport.put("mapCompleted", mapComp);
        jobReport.put("reduceCompleted", reduceComp);

        jobReport.put("setupProgress", setupProgress);
        jobReport.put("cleanupProgress", cleanupProgress);

        jobReport.put("schedulingInfo", jobSts.getSchedulingInfo());
        jobReport.put("jobState", JobStatus.getJobRunState(jobSts.getRunState()));
        jobReport.put("failureInfo", failureInfo);
        jobReport.put("jobFile", job.getJobFile());
        jobReport.put("trackingURL", job.getTrackingURL());

        jobReport.putAll(getDetailedJobReport(jobId));
        return jobReport;
    }

    /**
     * Builds the detailed report (counters plus map/reduce/cleanup/setup task
     * reports) for a job.
     * 
     * @param jobId
     *            the job id
     * @return the detailed job report map
     * @throws IOException
     */
    private Map<String, Object> getDetailedJobReport(org.apache.hadoop.mapred.JobID jobId) throws IOException {
        Map<String, Object> jobDetailedReport = new HashMap<String, Object>();

        RunningJob job = jobClient.getJob(jobId);
        // Guarding against a job that is no longer tracked by the JobTracker.
        if (job == null) {
            return jobDetailedReport;
        }
        Counters counters = job.getCounters();
        List<Map<String, Object>> counterList = new ArrayList<Map<String, Object>>();
        for (Group group : counters) {
            Map<String, Object> counterMap = new HashMap<String, Object>();
            counterMap.put("name", group.getDisplayName());
            List<Map<String, Object>> subCounters = new ArrayList<Map<String, Object>>();
            for (Counter counter : group) {
                Map<String, Object> subCounter = new HashMap<String, Object>();
                subCounter.put("name", counter.getDisplayName());
                subCounter.put("value", counter.getCounter());
                subCounters.add(subCounter);
            }
            counterMap.put("subCounters", subCounters);
            counterList.add(counterMap);
        }
        jobDetailedReport.put("counters", counterList);
        jobDetailedReport.put("mapReport", getTaskReport(jobClient.getMapTaskReports(jobId)));
        jobDetailedReport.put("reduceReport", getTaskReport(jobClient.getReduceTaskReports(jobId)));
        jobDetailedReport.put("cleanupReport", getTaskReport(jobClient.getCleanupTaskReports(jobId)));
        jobDetailedReport.put("setupReport", getTaskReport(jobClient.getSetupTaskReports(jobId)));
        return jobDetailedReport;
    }

    public Map<String, Object> getJobDetails(JobClient jobClient, String jobId) throws AnkushException {
        String errMsg = "Unable to fetch Hadoop job details, could not connect to Hadoop JobClient.";
        try {
            if (jobClient != null) {
                // Get the jobs that are submitted and look for the matching job id.
                JobStatus[] jobStatus = jobClient.getAllJobs();
                for (JobStatus jobSts : jobStatus) {
                    if (jobSts.getJobID().toString().equals(jobId)) {
                        return getJobReport(jobSts);
                    }
                }
            }
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg, Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
        return null;
    }

    /**
     * Gets the scheduler type.
     * 
     * @return String
     */
    private String getSchedulerType() {
        String schedulerType = "default";
        try {
            String hadoopConfPath = HadoopUtils.getHadoopConfDir(this.compConfig);
            String mapredFilePath = hadoopConfPath + HadoopConstants.FileName.ConfigurationFile.XML_MAPRED_SITE;

            AnkushTask readXmlProperty = new ReadConfProperty("mapred.jobtracker.taskScheduler", mapredFilePath,
                    Constant.File_Extension.XML, this.clusterConfig.getAgentInstallDir());

            NodeConfig jobTrackerNode = this.clusterConfig.getNodes()
                    .get(HadoopUtils.getJobTrackerHost(compConfig));
            Result result = jobTrackerNode.getConnection().exec(readXmlProperty);

            if (result.isSuccess) {
                // Checking for not null
                if (result.sysout != null) {
                    if (result.sysout.contains("FairScheduler")) {
                        schedulerType = "Fair";
                    }
                    if (result.sysout.contains("CapacityTaskScheduler")) {
                        schedulerType = "Capacity";
                    }
                }
            }
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig,
                    "Could not get Scheduler type from "
                            + HadoopConstants.FileName.ConfigurationFile.XML_MAPRED_SITE + " file.",
                    Constant.Component.Name.HADOOP, e);
        }
        // returns the scheduler type
        return schedulerType;
    }

    /**
     * Gets the running job list.
     * 
     * @param allJobsList
     *            the all jobs list
     * @return List
     */
    private List<Map<String, Object>> getRunningJobList(List<Map<String, Object>> allJobsList) {
        return getJobList("RUNNING", allJobsList);
    }

    /**
     * Gets the completed jobs.
     * 
     * @param allJobsList
     *            the all jobs list
     * @return the completed jobs
     */
    private List<Map<String, Object>> getCompletedJobs(List<Map<String, Object>> allJobsList) {
        return getJobList("SUCCEEDED", allJobsList);
    }

    /**
     * Getting the job list via state.
     * 
     * @param state
     * @param allJobsList
     * @return
     */
    private List<Map<String, Object>> getJobList(String state, List<Map<String, Object>> allJobsList) {
        // Creating an empty list of maps for storing matching job information
        List<Map<String, Object>> jobList = new ArrayList<Map<String, Object>>();
        // Iterating over all jobs submitted to the cluster
        for (Map<String, Object> jAct : allJobsList) {
            // Extracting the job state
            String jobState = (String) jAct.get("jobState");
            if (state.equals(jobState)) {
                jobList.add(jAct);
            }
        }
        return jobList;
    }

    /**
     * Gets the job client.
     * 
     * @return JobClient object
     */
    private JobClient getJobClient(String host, String port) {
        // JobClient is the primary interface for the user-job to interact with
        // the JobTracker.
        JobClient jobClient = null;
        if (host == null) {
            host = "localhost";
        }
        LOG.info("Requesting job Client..");
        try {
            // Provides access to configuration parameters.
            Configuration conf = new Configuration();
            LOG.info("JobClient : " + host + " & port : " + port);
            // Build a job client, connect to the indicated job tracker.
            jobClient = new JobClient(new InetSocketAddress(host, Integer.parseInt(port)), new JobConf(conf));
            // Set the configuration to be used by this object.
            jobClient.setConf(conf);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }

        return jobClient;
    }

    /**
     * Gets the task report.
     * 
     * @param taskReports
     *            the task reports
     * @return the task report
     */
    private Map<String, Object> getTaskReport(TaskReport[] taskReports) {
        Map<String, Object> taskReportsInfo = new HashMap<String, Object>();
        try {
            // A report on the state of the tasks.
            if (taskReports != null) {
                LOG.info("Total tasks : " + taskReports.length);
                List<Map<String, Object>> taskLists = new ArrayList<Map<String, Object>>();
                int completeTask = 0;
                int failedTask = 0;
                int killedTask = 0;
                int runningTask = 0;
                int pendingTask = 0;
                Map<String, Object[]> diagInfo = new HashMap<String, Object[]>();
                // Iterating over the task reports
                for (TaskReport mtr : taskReports) {
                    // Creating an empty map for storing task details
                    Map<String, Object> taskReport = new HashMap<String, Object>();
                    // The current status of the task
                    TIPStatus currentStatus = mtr.getCurrentStatus();
                    // Checking for task's current status COMPLETE
                    if (currentStatus == TIPStatus.COMPLETE) {
                        completeTask++;
                    }
                    // Checking for task's current status KILLED
                    if (currentStatus == TIPStatus.KILLED) {
                        killedTask++;
                    }
                    // Checking for task's current status RUNNING
                    if (currentStatus == TIPStatus.RUNNING) {
                        runningTask++;
                    }
                    // Checking for task's current status PENDING
                    if (currentStatus == TIPStatus.PENDING) {
                        pendingTask++;
                    }
                    // Checking for task's current status FAILED
                    if (currentStatus == TIPStatus.FAILED) {
                        failedTask++;
                    }
                    // The id of the task.
                    TaskID taskId = mtr.getTaskID();
                    float progress = mtr.getProgress();
                    // The most recent state
                    String state = mtr.getState();

                    // Putting value in a map
                    taskReport.put("taskId", taskId.toString());
                    taskReport.put("successfulTaskAttemp", mtr.getSuccessfulTaskAttempt().toString());
                    taskReport.put("startTime", mtr.getStartTime());
                    taskReport.put("finishTime", mtr.getFinishTime());
                    taskReport.put("progress", progress * 100);
                    taskReport.put("state", state);
                    taskReport.put("currentStatus", currentStatus);
                    Counters counters = mtr.getCounters();
                    List<Map<String, Object>> countersList = new ArrayList<Map<String, Object>>();
                    for (Group group : counters) {
                        Map<String, Object> counterMap = new HashMap<String, Object>();
                        counterMap.put("name", group.getDisplayName());
                        List<Map<String, Object>> subCounters = new ArrayList<Map<String, Object>>();
                        for (Counter counter : group) {
                            Map<String, Object> subCounter = new HashMap<String, Object>();
                            subCounter.put("name", counter.getDisplayName());
                            subCounter.put("value", counter.getCounter());
                            subCounters.add(subCounter);
                        }
                        counterMap.put("subCounters", subCounters);
                        countersList.add(counterMap);
                    }
                    taskReport.put("counters", countersList);
                    taskLists.add(taskReport);
                    // A list of error messages.
                    String[] diagnostics = mtr.getDiagnostics();
                    if (diagnostics != null) {
                        int count = 0;
                        // Iterating over the list of error messages
                        for (String di : diagnostics) {
                            Object[] diagStatus = new Object[2];
                            diagStatus[0] = taskId;
                            diagStatus[1] = di;
                            diagInfo.put(taskId + "_" + count, diagStatus);
                            count++;
                        }
                    }
                }
                // Putting value in a map
                taskReportsInfo.put("completedTask", completeTask);
                taskReportsInfo.put("pendingTask", pendingTask);
                taskReportsInfo.put("killedTask", killedTask);
                taskReportsInfo.put("runningTask", runningTask);
                taskReportsInfo.put("failedTask", failedTask);
                taskReportsInfo.put("failedOrKilledTask", failedTask);
                taskReportsInfo.put("diagInfo", diagInfo);
                taskReportsInfo.put("tasks", taskLists);
            }
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, "Could not get task report",
                    Constant.Component.Name.HADOOP, e);
        }
        return taskReportsInfo;
    }

}
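
Usage

A minimal sketch of how the provider might be driven, assuming a populated ClusterConfig and ComponentConfig for a running MRv1 cluster. The no-argument construction of the two config objects is hypothetical (in Ankush they are normally populated by the framework), so treat this as an illustration of the call flow rather than runnable configuration:

package com.impetus.ankush2.hadoop.monitor;

import java.util.List;
import java.util.Map;

import com.impetus.ankush.common.exception.AnkushException;
import com.impetus.ankush2.framework.config.ClusterConfig;
import com.impetus.ankush2.framework.config.ComponentConfig;

public class JobStatusProviderDemo {

    public static void main(String[] args) throws Exception {
        // Hypothetical setup: both objects would normally be populated by
        // Ankush rather than built by hand.
        ClusterConfig clusterConfig = new ClusterConfig();
        ComponentConfig hadoopConfig = new ComponentConfig();

        JobStatusProvider provider = new JobStatusProvider(clusterConfig, hadoopConfig);

        try {
            // Cluster-wide metrics: tracker counts, task capacities, scheduler type.
            Map<String, Object> metrics = provider.getJobMetrics();
            System.out.println("JobTracker state : " + metrics.get("jobTrackerState"));
            System.out.println("Running jobs     : " + metrics.get("totalJobRunning"));

            // One report map per submitted job.
            List<Map<String, Object>> jobs = provider.listAllJobs();
            for (Map<String, Object> job : jobs) {
                System.out.println(job.get("jobId") + " -> " + job.get("jobState"));
            }
        } catch (AnkushException e) {
            // Raised when the JobClient connection is unavailable.
            e.printStackTrace();
        }
    }
}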