org.apache.hadoop.chukwa.inputtools.mdl.TorqueInfoProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.chukwa.inputtools.mdl.TorqueInfoProcessor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.chukwa.inputtools.mdl;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.Exception;
import java.lang.InterruptedException;
import java.lang.Process;
import java.lang.ProcessBuilder;
import java.lang.StringBuffer;
import java.lang.System;
import java.lang.Thread;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Timer;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.chukwa.inputtools.mdl.DataConfig;
import org.apache.hadoop.chukwa.inputtools.mdl.ErStreamHandler;
import org.apache.hadoop.chukwa.inputtools.mdl.TorqueTimerTask;
import org.apache.hadoop.chukwa.util.DatabaseWriter;

/**
 * Periodically runs Torque command line tools ({@code qstat}, {@code tracejob})
 * on a remote server through {@code ssh} and writes what it learns about each
 * job to the log.
 *
 * <p>Configuration is read from the system properties {@code TORQUE_SERVER}
 * (ssh target), {@code TORQUE_HOME} (its {@code bin} directory holds the
 * tools) and {@code DOMAIN} (optional suffix appended to machine names).
 *
 * <p>Each tracked job is a string map with the keys {@code userId},
 * {@code numOfMachine}, {@code traceCheckCount}, {@code process} and
 * {@code status}. The {@code process} value records how far the job has been
 * handled: {@code "0"} nothing logged yet, {@code "1"} qstat details logged,
 * {@code "2"} tracejob handling finished.
 *
 * <p>Instances keep mutable state without any synchronization; the code here
 * only ever touches it from the thread that calls {@link #run_forever()}.
 */
public class TorqueInfoProcessor {

    private static final Log log = LogFactory.getLog(TorqueInfoProcessor.class);

    /** Layout of the time values printed by {@code qstat -f}, e.g. "Tue Sep 9 23:44:29 2008". */
    private static final String QSTAT_DATE_PATTERN = "EEE MMM d HH:mm:ss yyyy";

    private static final String KEY_USER_ID = "userId";
    private static final String KEY_NUM_OF_MACHINE = "numOfMachine";
    private static final String KEY_TRACE_CHECK_COUNT = "traceCheckCount";
    private static final String KEY_PROCESS = "process";
    private static final String KEY_STATUS = "status";

    /** {@code process} value of a job for which nothing has been logged yet. */
    private static final String STAGE_NEW = "0";
    /** {@code process} value once {@link #loadQstatData(String)} has succeeded. */
    private static final String STAGE_QSTAT_LOADED = "1";
    /** {@code process} value once {@link #loadTraceJobData(String)} reports it is finished. */
    private static final String STAGE_TRACE_DONE = "2";

    /** Status assigned to a tracked job that no longer shows up in {@code qstat -a}. */
    private static final String STATUS_COMPLETED = "C";

    /** Number of unsuccessful tracejob lookups after which a job is given up on. */
    private static final int MAX_TRACE_CHECKS = 2;

    private final int intervalValue;
    private final String torqueServer;
    private final String torqueBinDir;
    private final String domain;

    private final TreeMap<String, TreeMap<String, String>> currentHodJobs;

    /**
     * Creates a processor configured from system properties.
     *
     * @param mdlConfig not used by this class
     * @param interval  pause between two polling rounds, in seconds
     */
    public TorqueInfoProcessor(DataConfig mdlConfig, int interval) {
        this.intervalValue = interval;

        torqueServer = System.getProperty("TORQUE_SERVER");
        // NOTE: an unset TORQUE_HOME yields the literal path "null/bin".
        torqueBinDir = System.getProperty("TORQUE_HOME") + File.separator + "bin";
        domain = System.getProperty("DOMAIN");
        currentHodJobs = new TreeMap<String, TreeMap<String, String>>();
    }

    /**
     * Does nothing.
     *
     * @param recover ignored
     * @throws Exception never thrown by this implementation
     */
    public void setup(boolean recover) throws Exception {
    }

    /**
     * Runs a command on the Torque server through ssh and returns everything
     * it printed on standard output, one list element per line.
     *
     * <p>A {@code TorqueTimerTask} is scheduled for the child process
     * (presumably to kill it on timeout; its implementation is not visible
     * here) and an {@code ErStreamHandler} consumes standard error. The timer
     * is always cancelled and the reader always closed; if reading fails the
     * child is destroyed, because the cancelled timer can no longer do so.
     *
     * @param remoteCommand    command line to execute on the server
     * @param errorHandlerFlag passed through as the third constructor argument
     *                         of {@code ErStreamHandler}
     * @return the lines written to standard output, possibly empty
     * @throws IOException if {@code TORQUE_SERVER} is not configured, the
     *                     process cannot be started or its output cannot be read
     */
    private List<String> runTorqueCommand(String remoteCommand, boolean errorHandlerFlag) throws IOException {
        if (torqueServer == null) {
            // ProcessBuilder would fail with an unchecked NullPointerException otherwise.
            throw new IOException("TORQUE_SERVER is not set, cannot run: " + remoteCommand);
        }
        String[] sshCommand = new String[] { "ssh", torqueServer, remoteCommand };
        String command = sshCommand[0] + " " + sshCommand[1] + " " + sshCommand[2];

        Process p = new ProcessBuilder(sshCommand).start();
        Timer timeout = new Timer();
        BufferedReader result = null;
        boolean completed = false;
        List<String> lines = new ArrayList<String>();
        try {
            timeout.schedule(new TorqueTimerTask(p, command), TorqueTimerTask.timeoutInterval * 1000L);

            result = new BufferedReader(new InputStreamReader(p.getInputStream()));
            ErStreamHandler errorHandler = new ErStreamHandler(p.getErrorStream(), command, errorHandlerFlag);
            errorHandler.start();

            // Read to end of stream so the child never blocks on a full pipe.
            String line;
            while ((line = result.readLine()) != null) {
                lines.add(line);
            }

            try {
                errorHandler.join();
            } catch (InterruptedException ie) {
                log.error(ie.getMessage());
                // Keep the interrupt visible to the polling loop.
                Thread.currentThread().interrupt();
            }
            completed = true;
        } finally {
            timeout.cancel();
            if (result != null) {
                try {
                    result.close();
                } catch (IOException closeFailure) {
                    log.debug("could not close output of: " + command, closeFailure);
                }
            }
            if (!completed) {
                p.destroy();
            }
        }
        return lines;
    }

    /** Returns the part of a job identifier in front of the first dot. */
    private static String jobNumber(String fullJobId) {
        int dot = fullJobId.indexOf('.');
        return dot < 0 ? fullJobId : fullJobId.substring(0, dot);
    }

    /** Returns the trimmed text between the first and second '=' of a line, or null if there is no value. */
    private static String valueAfterEquals(String line) {
        String[] parts = line.split("=");
        return parts.length >= 2 ? parts[1].trim() : null;
    }

    /**
     * Parses the time value of a {@code key = value} line printed by qstat.
     *
     * @param line     the output line
     * @param fallback value to return when the line carries no parsable time
     * @return the time in milliseconds since the epoch, or {@code fallback}
     */
    private static long parseQstatTime(String line, long fallback) {
        String value = valueAfterEquals(line);
        if (value == null) {
            return fallback;
        }
        try {
            // Day and month names are English regardless of the JVM default locale.
            return new SimpleDateFormat(QSTAT_DATE_PATTERN, Locale.ENGLISH).parse(value).getTime();
        } catch (ParseException e) {
            log.warn("cannot parse qstat time '" + value + "'", e);
            return fallback;
        }
    }

    /** Returns the host part of an exec_host entry such as "node17/3" (text before the first '/'). */
    private static String hostOf(String execHostEntry) {
        int slash = execHostEntry.indexOf('/');
        return slash < 0 ? execHostEntry : execHostEntry.substring(0, slash);
    }

    /**
     * Logs one "HodId=..., Machine=..." line for every entry of a
     * '+'-separated exec_host list.
     *
     * @return the number of machine lines written
     */
    private int logMachines(String hodId, String hosts) {
        int num = 0;
        String[] entries = hosts.split("[+]");
        for (int i = 0; i < entries.length; i++) {
            String machine = hostOf(entries[i]);
            if (machine.length() == 0) {
                continue;
            }
            StringBuilder data = new StringBuilder();
            data.append("HodId=").append(hodId);
            data.append(", Machine=").append(machine);
            if (domain != null) {
                data.append(".").append(domain);
            }
            log.info(data.toString());
            num++;
        }
        return num;
    }

    /**
     * Refreshes the tracked jobs from {@code qstat -a}.
     *
     * <p>Rows after the first line starting with "---" that have at least ten
     * whitespace separated columns are treated as jobs: column 0 is the job
     * id, 1 the user, 5 the machine count and 9 the status. New jobs are
     * added, known jobs get their status updated. Tracked jobs missing from
     * the listing are marked completed while their {@code process} stage is
     * "0" or "1", and dropped otherwise.
     *
     * @throws IOException if the remote command fails
     */
    private void getHodJobInfo() throws IOException {
        List<String> output = runTorqueCommand(torqueBinDir + "/qstat -a", true);

        boolean inJobTable = false;
        TreeSet<String> jobsInTorque = new TreeSet<String>();
        for (String line : output) {
            if (line.startsWith("---")) {
                inJobTable = true;
                continue;
            }
            if (!inJobTable) {
                continue;
            }
            String[] items = line.split("\\s+");
            if (items.length < 10) {
                continue;
            }

            String hodId = jobNumber(items[0]);
            jobsInTorque.add(hodId);

            TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
            if (aJobData == null) {
                aJobData = new TreeMap<String, String>();
                aJobData.put(KEY_USER_ID, items[1]);
                aJobData.put(KEY_NUM_OF_MACHINE, items[5]);
                aJobData.put(KEY_TRACE_CHECK_COUNT, "0");
                aJobData.put(KEY_PROCESS, STAGE_NEW);
                currentHodJobs.put(hodId, aJobData);
            }
            aJobData.put(KEY_STATUS, items[9]);
        }

        // Iterator.remove keeps the removal safe while walking the map.
        Iterator<Map.Entry<String, TreeMap<String, String>>> tracked = currentHodJobs.entrySet().iterator();
        while (tracked.hasNext()) {
            Map.Entry<String, TreeMap<String, String>> entry = tracked.next();
            if (jobsInTorque.contains(entry.getKey())) {
                continue;
            }
            TreeMap<String, String> aJobData = entry.getValue();
            String stage = aJobData.get(KEY_PROCESS);
            if (STAGE_NEW.equals(stage) || STAGE_QSTAT_LOADED.equals(stage)) {
                aJobData.put(KEY_STATUS, STATUS_COMPLETED);
            } else {
                tracked.remove();
            }
        }
    }

    /**
     * Logs machine list and timing of a job as reported by
     * {@code qstat -f -1 <hodId>}.
     *
     * <p>Lines containing "ctime", "mtime", "etime" and "exec_host" supply
     * start time, end time, execute time and host list. End and execute time
     * default to the current time when absent or unparsable.
     *
     * @param hodId id of a job present in the tracked job map
     * @return {@code true} if both a host list and a start time were found
     *         and the job was logged
     * @throws IOException  if the remote command fails
     * @throws SQLException never thrown by this implementation
     */
    private boolean loadQstatData(String hodId) throws IOException, SQLException {
        TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
        String userId = aJobData.get(KEY_USER_ID);

        List<String> output = runTorqueCommand(torqueBinDir + "/qstat -f -1 " + hodId, false);

        String hosts = null;
        long startTimeValue = -1;
        long now = System.currentTimeMillis();
        long endTimeValue = now;
        long executeTimeValue = now;

        for (String line : output) {
            if (line.indexOf("ctime") >= 0) {
                startTimeValue = parseQstatTime(line, startTimeValue);
            }
            if (line.indexOf("mtime") >= 0) {
                endTimeValue = parseQstatTime(line, endTimeValue);
            }
            if (line.indexOf("etime") >= 0) {
                executeTimeValue = parseQstatTime(line, executeTimeValue);
            }
            if (line.indexOf("exec_host") >= 0) {
                String value = valueAfterEquals(line);
                if (value != null) {
                    hosts = value;
                }
            }
        }

        if (hosts == null || startTimeValue < 0) {
            return false;
        }

        int num = logMachines(hodId, hosts);

        Timestamp startTimedb = new Timestamp(startTimeValue);
        Timestamp endTimedb = new Timestamp(endTimeValue);
        long timeQueued = executeTimeValue - startTimeValue;

        StringBuilder data = new StringBuilder();
        data.append("HodID=").append(hodId);
        data.append(", UserId=").append(userId);
        data.append(", StartTime=").append(startTimedb);
        data.append(", TimeQueued=").append(timeQueued);
        data.append(", NumOfMachines=").append(num);
        data.append(", EndTime=").append(endTimedb);
        log.info(data.toString());
        return true;
    }

    /** Reads a required numeric field of a tracejob record; a missing field is reported like a malformed one. */
    private static long requiredLong(Map<String, String> jobData, String key) {
        String value = jobData.get(key);
        if (value == null) {
            throw new NumberFormatException("missing field " + key);
        }
        return Long.parseLong(value);
    }

    /**
     * Parses one tracejob end record made of whitespace separated
     * {@code key=value} tokens and logs its machines and summary.
     *
     * <p>{@code ctime} and {@code end} are rounded down to a multiple of 60
     * and treated as seconds since the epoch; the queue time is
     * {@code start - qtime}.
     *
     * @return {@code true} if the record was complete and has been logged,
     *         {@code false} if a required field was missing or not numeric
     */
    private boolean logTraceJobRecord(String hodId, String userId, String line) {
        TreeMap<String, String> jobData = new TreeMap<String, String>();
        String[] items = line.split("\\s+");
        for (int i = 0; i < items.length; i++) {
            String[] keyValue = items[i].split("=");
            if (keyValue.length >= 2) {
                jobData.put(keyValue[0], keyValue[1]);
            }
        }

        String hosts = jobData.get("exec_host");
        if (hosts == null) {
            log.warn("tracejob record for job " + hodId + " has no exec_host: " + line);
            return false;
        }

        try {
            long startTimeValue = requiredLong(jobData, "ctime");
            startTimeValue = startTimeValue - startTimeValue % (60);
            Timestamp startTimedb = new Timestamp(startTimeValue * 1000);

            long timeQueued = requiredLong(jobData, "start") - requiredLong(jobData, "qtime");

            long endTimeValue = requiredLong(jobData, "end");
            endTimeValue = endTimeValue - endTimeValue % (60);
            Timestamp endTimedb = new Timestamp(endTimeValue * 1000);

            String exit_status = jobData.get("Exit_status");
            int num = logMachines(hodId, hosts);

            StringBuilder data = new StringBuilder();
            data.append("HodID=").append(hodId);
            data.append(", UserId=").append(userId);
            data.append(", Status=").append(exit_status);
            data.append(", TimeQueued=").append(timeQueued);
            data.append(", StartTime=").append(startTimedb);
            data.append(", EndTime=").append(endTimedb);
            data.append(", NumOfMachines=").append(num);
            log.info(data.toString());
            return true;
        } catch (NumberFormatException nfe) {
            log.warn("malformed tracejob record for job " + hodId + " (" + nfe.getMessage() + "): " + line);
            return false;
        }
    }

    /**
     * Logs the final record of a job found by
     * {@code tracejob -n 10 -l -m -s <hodId>}.
     *
     * <p>The first output line that contains "end", "Exit_status" and "qtime"
     * and parses completely is logged. When no such line exists the job's
     * {@code traceCheckCount} is incremented.
     *
     * @param hodId id of a job present in the tracked job map
     * @return {@code true} if a record was logged or the job has now been
     *         checked unsuccessfully at least twice
     * @throws IOException  if the remote command fails
     * @throws SQLException never thrown by this implementation
     */
    private boolean loadTraceJobData(String hodId) throws IOException, SQLException {
        TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
        String userId = aJobData.get(KEY_USER_ID);

        List<String> output = runTorqueCommand(torqueBinDir + "/tracejob -n 10 -l -m -s " + hodId, false);

        for (String line : output) {
            boolean looksLikeEndRecord = line.indexOf("end") >= 0 && line.indexOf("Exit_status") >= 0
                    && line.indexOf("qtime") >= 0;
            if (looksLikeEndRecord && logTraceJobRecord(hodId, userId, line)) {
                log.debug(" hod info for job " + hodId + " has been loaded ");
                return true;
            }
        }

        int traceCheckCountValue = Integer.parseInt(aJobData.get(KEY_TRACE_CHECK_COUNT)) + 1;
        aJobData.put(KEY_TRACE_CHECK_COUNT, String.valueOf(traceCheckCountValue));

        log.debug("did not find tracejob info for job " + hodId + ", after " + traceCheckCountValue
                + " times checking");
        return traceCheckCountValue >= MAX_TRACE_CHECKS;
    }

    /**
     * Advances every tracked job by at most one step: jobs at stage "0" with
     * status "R" or "E" are looked up with qstat, jobs with status "C" that
     * have not reached stage "2" are looked up with tracejob. I/O failures
     * are logged and the job is retried in the next round.
     *
     * @throws SQLException never thrown by this implementation
     */
    private void process_data() throws SQLException {
        for (Map.Entry<String, TreeMap<String, String>> entry : currentHodJobs.entrySet()) {
            String hodId = entry.getKey();
            TreeMap<String, String> aJobData = entry.getValue();
            String status = aJobData.get(KEY_STATUS);
            String process = aJobData.get(KEY_PROCESS);

            if (process.equals(STAGE_NEW) && (status.equals("R") || status.equals("E"))) {
                try {
                    if (loadQstatData(hodId)) {
                        aJobData.put(KEY_PROCESS, STAGE_QSTAT_LOADED);
                    }
                } catch (IOException ioe) {
                    log.error("load qsat data Error:" + ioe.getMessage(), ioe);
                }
            }

            if (!process.equals(STAGE_TRACE_DONE) && status.equals(STATUS_COMPLETED)) {
                try {
                    if (loadTraceJobData(hodId)) {
                        aJobData.put(KEY_PROCESS, STAGE_TRACE_DONE);
                    }
                } catch (IOException ioe) {
                    log.error("loadTraceJobData Error:" + ioe.getMessage(), ioe);
                }
            }
        }
    }

    /**
     * Performs one polling round: refreshes the job list and then processes
     * the jobs. If the job list cannot be read the round is abandoned.
     *
     * @throws SQLException if processing the jobs reports one
     */
    private void handle_jobData() throws SQLException {
        try {
            getHodJobInfo();
        } catch (IOException ex) {
            log.error("getQueueInfo Error:" + ex.getMessage(), ex);
            return;
        }
        try {
            process_data();
        } catch (SQLException ex) {
            log.error("process_data Error:" + ex.getMessage());
            throw ex;
        }
    }

    /**
     * Polls the Torque server in an endless loop, sleeping the configured
     * number of seconds between rounds.
     *
     * <p>The loop ends only when the calling thread is interrupted; the
     * interrupt flag is then restored and the method returns.
     *
     * @throws SQLException if a polling round reports one
     */
    public void run_forever() throws SQLException {
        while (true) {
            handle_jobData();
            try {
                log.debug("sleeping ...");
                // long arithmetic: large intervals must not overflow int
                Thread.sleep(this.intervalValue * 1000L);
            } catch (InterruptedException e) {
                log.error(e.getMessage());
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /** Does nothing. */
    public void shutdown() {
    }
}