org.apache.hadoop.mapred.TestKillSubProcesses.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapred.TestKillSubProcesses.java.
The test starts a MiniMRCluster, runs map tasks that spawn a chain of shell
subprocesses, and verifies that the whole process tree is cleaned up when the
job is killed, when it fails, and when it succeeds.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;

import junit.framework.TestCase;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ProcessTree;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A JUnit test that kills a job whose tasks spawn children and checks whether
 * the children (subprocesses of the Java task) are also killed when a task is
 * killed.
 */
public class TestKillSubProcesses extends TestCase {

    private static volatile Log LOG = LogFactory.getLog(TestKillSubProcesses.class);

    private static String BASE_TEST_ROOT_DIR = new File(System.getProperty("test.build.data", "/tmp"))
            .getAbsolutePath();
    private static String TEST_ROOT_DIR = BASE_TEST_ROOT_DIR + Path.SEPARATOR + "killSubProcesses";

    private static Path scriptDir = new Path(TEST_ROOT_DIR, "script");
    private static String scriptDirName = scriptDir.toUri().getPath();
    private static Path signalFile = new Path(TEST_ROOT_DIR + "/script/signalFile");
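    // Tasks poll for this file; creating it signals a waiting mapper to proceed.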

    private static JobClient jobClient = null;

    static MiniMRCluster mr = null;

    private static String pid = null;

    // number of levels in the subtree of subprocesses of map task
    private static int numLevelsOfSubProcesses = 4;

    /**
     * Runs a job, kills the job and verifies that the map task and its
     * subprocesses are killed properly.
     */
    private static void runKillingJobAndValidate(JobTracker jt, JobConf conf) throws IOException {

        conf.setJobName("testkilljobsubprocesses");
        conf.setMapperClass(KillingMapperWithChildren.class);

        RunningJob job = runJobAndSetProcessHandle(jt, conf);

        // kill the job now
        job.killJob();

        while (job.cleanupProgress() == 0.0f) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException ie) {
                LOG.warn("sleep is interrupted:" + ie);
                break;
            }
        }

        validateKillingSubprocesses(job, conf);
        // Checking the Job status
        assertEquals(JobStatus.KILLED, job.getJobState());
    }

    /**
     * Runs a job that will fail and verifies that the subprocesses of the
     * failed map task are killed properly.
     */
    private static void runFailingJobAndValidate(JobTracker jt, JobConf conf) throws IOException {

        conf.setJobName("testfailjobsubprocesses");
        conf.setMapperClass(FailingMapperWithChildren.class);

        // We don't want to run the failing map task 4 times. So we run it once and
        // check if all the subprocesses are killed properly.
        conf.setMaxMapAttempts(1);

        RunningJob job = runJobAndSetProcessHandle(jt, conf);
        signalTask(signalFile.toString(), conf);
        validateKillingSubprocesses(job, conf);
        // Checking the Job status
        assertEquals(JobStatus.FAILED, job.getJobState());
    }

    /**
     * Runs a job that will succeed and verifies that the subprocesses of the
     * succeeded map task are killed properly.
     */
    private static void runSuccessfulJobAndValidate(JobTracker jt, JobConf conf) throws IOException {

        conf.setJobName("testsucceedjobsubprocesses");
        conf.setMapperClass(MapperWithChildren.class);

        RunningJob job = runJobAndSetProcessHandle(jt, conf);
        signalTask(signalFile.toString(), conf);
        validateKillingSubprocesses(job, conf);
        // Checking the Job status
        assertEquals(JobStatus.SUCCEEDED, job.getJobState());
    }

    /**
     * Runs the given job and saves the pid of the map task.
     * Also checks that the subprocesses of the map task are alive.
     */
    private static RunningJob runJobAndSetProcessHandle(JobTracker jt, JobConf conf) throws IOException {
        RunningJob job = runJob(conf);
        while (job.getJobState() != JobStatus.RUNNING) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                break;
            }
        }

        pid = null;
        jobClient = new JobClient(conf);

        // get the taskAttemptID of the map task and use it to get the pid
        // of map task
        TaskReport[] mapReports = jobClient.getMapTaskReports(job.getID());

        JobInProgress jip = jt.getJob(job.getID());
        for (TaskReport tr : mapReports) {
            TaskInProgress tip = jip.getTaskInProgress(tr.getTaskID());

            // for this tip, get active tasks of all attempts
            while (tip.getActiveTasks().size() == 0) {
                //wait till the activeTasks Tree is built
                try {
                    Thread.sleep(500);
                } catch (InterruptedException ie) {
                    LOG.warn("sleep is interrupted:" + ie);
                    break;
                }
            }

            for (TaskAttemptID id : tip.getActiveTasks().keySet()) {
                LOG.info("taskAttemptID of map task is " + id);

                while (pid == null) {
                    pid = mr.getTaskTrackerRunner(0).getTaskTracker().getPid(id);
                    if (pid == null) {
                        try {
                            Thread.sleep(500);
                        } catch (InterruptedException e) {
                        }
                    }
                }
                LOG.info("pid of map task is " + pid);
                //Checking if the map task is alive
                assertTrue("Map is no more alive", isAlive(pid));
                LOG.info("The map task is alive before Job completion, as expected.");
            }
        }

        // Checking if the descendant processes of map task are alive
        if (ProcessTree.isSetsidAvailable) {
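            // The child pid files are written by the shell script that the map
            // task launches from configure() (see runChildren()).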
            String childPid = UtilsForTests.getPidFromPidFile(scriptDirName + "/childPidFile" + 0);
            while (childPid == null) {
                LOG.warn(scriptDirName + "/childPidFile" + 0 + " is null; Sleeping...");
                try {
                    Thread.sleep(500);
                } catch (InterruptedException ie) {
                    LOG.warn("sleep is interrupted:" + ie);
                    break;
                }
                childPid = UtilsForTests.getPidFromPidFile(scriptDirName + "/childPidFile" + 0);
            }

            // Once childPidFile0 (the leaf process in the subtree of processes
            // rooted at the map task) is created, all other child pid files
            // must already exist (see the script for details).
            // Now check that the descendants of the map task are alive.
            for (int i = 0; i <= numLevelsOfSubProcesses; i++) {
                childPid = UtilsForTests.getPidFromPidFile(scriptDirName + "/childPidFile" + i);
                LOG.info("pid of the descendant process at level " + i
                        + "in the subtree of processes(with the map task as the root)" + " is " + childPid);
                assertTrue("Unexpected: The subprocess at level " + i
                        + " in the subtree is not alive before Job completion", isAlive(childPid));
            }
        }
        return job;
    }

    /**
     * Verifies if the subprocesses of the map task are killed properly.
     */
    private static void validateKillingSubprocesses(RunningJob job, JobConf conf) throws IOException {
        // wait till the job finishes
        while (!job.isComplete()) {
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                break;
            }
        }

        // Checking if the map task got killed or not
        assertFalse("Map task is still alive after job completion", ProcessTree.isAlive(pid));
        LOG.info("The map task is not alive after Job is completed, as expected.");

        // Checking if the descendant processes of map task are killed properly
        if (ProcessTree.isSetsidAvailable) {
            for (int i = 0; i <= numLevelsOfSubProcesses; i++) {
                String childPid = UtilsForTests.getPidFromPidFile(scriptDirName + "/childPidFile" + i);
                LOG.info("pid of the descendant process at level " + i
                        + "in the subtree of processes(with the map task as the root)" + " is " + childPid);
                assertTrue("Unexpected: The subprocess at level " + i
                        + " in the subtree is alive after Job completion", !isAlive(childPid));
            }
        }
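        // Clean up the script directory so that the next job in this test run
        // starts from a clean state.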
        FileSystem fs = FileSystem.getLocal(mr.createJobConf());
        if (fs.exists(scriptDir)) {
            fs.delete(scriptDir, true);
        }
    }

    private static RunningJob runJob(JobConf conf) throws IOException {

        final Path inDir;
        final Path outDir;
        FileSystem fs = FileSystem.getLocal(conf);
        FileSystem tempFs = FileSystem.get(conf);
        //Check whether the test is run against HDFS or the local file system.
        //On the local file system, prepend TEST_ROOT_DIR; otherwise the
        //killjob folder would be created in the workspace root.
        if (!tempFs.getUri().toASCIIString().equals(fs.getUri().toASCIIString())) {
            inDir = new Path("killjob/input");
            outDir = new Path("killjob/output");
        } else {
            inDir = new Path(TEST_ROOT_DIR, "input");
            outDir = new Path(TEST_ROOT_DIR, "output");
        }

        if (fs.exists(scriptDir)) {
            fs.delete(scriptDir, true);
        }

        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(0);

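        // Propagate test.build.data to the task JVMs so that the static paths
        // computed there (TEST_ROOT_DIR, scriptDir) match the ones used here.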
        conf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS,
                conf.get(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, conf.get(JobConf.MAPRED_TASK_JAVA_OPTS))
                        + " -Dtest.build.data=" + BASE_TEST_ROOT_DIR);
        conf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS,
                conf.get(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, conf.get(JobConf.MAPRED_TASK_JAVA_OPTS))
                        + " -Dtest.build.data=" + BASE_TEST_ROOT_DIR);

        return UtilsForTests.runJob(conf, inDir, outDir);
    }

    public void testJobKillFailAndSucceed() throws IOException {
        if (Shell.WINDOWS) {
            System.out.println("setsid doesn't work on WINDOWS as expected. Not testing");
            return;
        }

        try {
            JobConf conf = new JobConf();
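            // No grace period between SIGTERM and SIGKILL when killing task
            // JVMs, so subprocess cleanup is exercised immediately.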
            conf.setLong(JvmManager.JvmManagerForType.DELAY_BEFORE_KILL_KEY, 0L);
            mr = new MiniMRCluster(1, "file:///", 1, null, null, conf);

            // run the TCs
            conf = mr.createJobConf();
            JobTracker jt = mr.getJobTrackerRunner().getJobTracker();
            runTests(conf, jt);
        } finally {
            if (mr != null) {
                mr.shutdown();
            }
        }
    }

    void runTests(JobConf conf, JobTracker jt) throws IOException {
        FileSystem fs = FileSystem.getLocal(mr.createJobConf());
        Path rootDir = new Path(TEST_ROOT_DIR);
        if (!fs.exists(rootDir)) {
            fs.mkdirs(rootDir);
        }
        fs.setPermission(rootDir, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
        runKillingJobAndValidate(jt, conf);
        runFailingJobAndValidate(jt, conf);
        runSuccessfulJobAndValidate(jt, conf);
    }

    /**
     * Creates the signal file that the mappers wait for.
     */
    private static void signalTask(String signalFile, JobConf conf) {
        try {
            FileSystem fs = FileSystem.getLocal(conf);
            fs.createNewFile(new Path(signalFile));
        } catch (IOException e) {
            LOG.warn("Unable to create signal file. " + e);
        }
    }

    /**
     * Runs a recursive shell script to create a chain of subprocesses
     */
    private static void runChildren(JobConf conf) throws IOException {
        if (ProcessTree.isSetsidAvailable) {
            FileSystem fs = FileSystem.getLocal(conf);

            if (fs.exists(scriptDir)) {
                fs.delete(scriptDir, true);
            }

            // Create the directory and set open permissions so that the TT can
            // access it.
            fs.mkdirs(scriptDir);
            fs.setPermission(scriptDir, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

            // create shell script
            Random rm = new Random();
            Path scriptPath = new Path(scriptDirName, "_shellScript_" + rm.nextInt() + ".sh");
            String shellScript = scriptPath.toString();

            // Construct the script. Set umask to 0000 so that TT can access all the
            // files.
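            // When invoked as "script.sh N", the script writes its own pid to
            // childPidFile<N>, traps SIGTERM, and recurses with N-1; the final
            // invocation (N == 0) loops forever so that the leaf process stays
            // alive:
            //
            //   umask 000
            //   echo $$ > <scriptDir>/childPidFile$1
            //   echo hello
            //   trap 'echo got SIGTERM' 15
            //   if [ $1 != 0 ]
            //   then
            //     sh <shellScript> $(($1-1))
            //   else
            //     while true
            //     do
            //       sleep 2
            //     done
            //   fi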
            String script = "umask 000\n" + "echo $$ > " + scriptDirName + "/childPidFile" + "$1\n" + "echo hello\n"
                    + "trap 'echo got SIGTERM' 15 \n" + "if [ $1 != 0 ]\nthen\n" + " sh " + shellScript
                    + " $(($1-1))\n" + "else\n" + " while true\n do\n" + "  sleep 2\n" + " done\n" + "fi";
            DataOutputStream file = fs.create(scriptPath);
            file.writeBytes(script);
            file.close();

            // Set executable permissions on the script.
            new File(scriptPath.toUri().getPath()).setExecutable(true);

            LOG.info("Calling script from map task : " + shellScript);
            Runtime.getRuntime().exec(shellScript + " " + numLevelsOfSubProcesses);

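            // Block until the leaf process (childPidFile0) has written its pid;
            // at that point the whole chain of subprocesses is known to be up.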
            String childPid = UtilsForTests.getPidFromPidFile(scriptDirName + "/childPidFile" + 0);
            while (childPid == null) {
                LOG.warn(scriptDirName + "/childPidFile" + 0 + " is null; Sleeping...");
                try {
                    Thread.sleep(500);
                } catch (InterruptedException ie) {
                    LOG.warn("sleep is interrupted:" + ie);
                    break;
                }
                childPid = UtilsForTests.getPidFromPidFile(scriptDirName + "/childPidFile" + 0);
            }
        }
    }

    /**
     * Mapper that starts children
     */
    static class MapperWithChildren extends MapReduceBase
            implements Mapper<WritableComparable, Writable, WritableComparable, Writable> {
        FileSystem fs = null;

        public void configure(JobConf conf) {
            try {
                fs = FileSystem.getLocal(conf);
                runChildren(conf);
            } catch (Exception e) {
                LOG.warn("Exception in configure: " + StringUtils.stringifyException(e));
            }
        }

        // Mapper waits for the signal (the existence of a file)
        public void map(WritableComparable key, Writable value, OutputCollector<WritableComparable, Writable> out,
                Reporter reporter) throws IOException {
            while (!fs.exists(signalFile)) {// wait for signal file creation
                try {
                    reporter.progress();
                    synchronized (this) {
                        this.wait(1000);
                    }
                } catch (InterruptedException ie) {
                    System.out.println("Interrupted while the map was waiting for " + " the signal.");
                    break;
                }
            }
        }
    }

    /**
     * Mapper that waits till it gets killed.
     */
    static class KillingMapperWithChildren extends MapperWithChildren {
        public void configure(JobConf conf) {
            super.configure(conf);
        }

        public void map(WritableComparable key, Writable value, OutputCollector<WritableComparable, Writable> out,
                Reporter reporter) throws IOException {

            try {
                while (true) {//just wait till kill happens
                    Thread.sleep(1000);
                }
            } catch (InterruptedException e) {
                LOG.warn("Exception in KillMapperWithChild.map:" + e);
            }
        }
    }

    /**
     * Mapper that fails when it receives a signal. The signal is the existence
     * of a file.
     */
    static class FailingMapperWithChildren extends MapperWithChildren {
        public void configure(JobConf conf) {
            super.configure(conf);
        }

        public void map(WritableComparable key, Writable value, OutputCollector<WritableComparable, Writable> out,
                Reporter reporter) throws IOException {
            while (!fs.exists(signalFile)) {// wait for signal file creation
                try {
                    reporter.progress();
                    synchronized (this) {
                        this.wait(1000);
                    }
                } catch (InterruptedException ie) {
                    System.out.println("Interrupted while the map was waiting for " + " the signal.");
                    break;
                }
            }
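            // Once signalled, fail the task so that the task-failure path is
            // exercised.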
            throw new RuntimeException("failing map");
        }
    }

    /**
     * Checks whether the process with the given pid is currently alive.
     * 
     * @param pid pid of the process
     * @return true if the process is alive, false otherwise.
     */
    private static boolean isAlive(String pid) throws IOException {
        String commandString = "ps -o pid,command -e";
        String args[] = new String[] { "bash", "-c", commandString };
        ShellCommandExecutor shExec = new ShellCommandExecutor(args);
        try {
            shExec.execute();
        } catch (ExitCodeException e) {
            return false;
        } catch (IOException e) {
            LOG.warn("IOExecption thrown while checking if process is alive" + StringUtils.stringifyException(e));
            throw e;
        }

        String output = shExec.getOutput();

        //Parse the command output and check for the pid, ignoring lines whose
        //command contains ps or grep.
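        //Example: a line such as "12345 sleep 2" matches when pid is "12345".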
        StringTokenizer strTok = new StringTokenizer(output, "\n");
        boolean found = false;
        while (strTok.hasMoreTokens()) {
            StringTokenizer pidToken = new StringTokenizer(strTok.nextToken(), " ");
            String pidStr = pidToken.nextToken();
            String commandStr = pidToken.nextToken();
            if (pid.equals(pidStr) && !(commandStr.contains("ps") || commandStr.contains("grep"))) {
                found = true;
                break;
            }
        }
        return found;
    }

}
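
Usage

Below is a minimal sketch of running this test on its own (JUnit 3 style). It
assumes the Hadoop 0.20-era test classpath, including junit and the
MiniMRCluster test classes, is available; the runner class name
RunKillSubProcesses is hypothetical.

import junit.framework.TestResult;

import org.apache.hadoop.mapred.TestKillSubProcesses;

public class RunKillSubProcesses {
    public static void main(String[] args) {
        // Select the test method by name and run it directly (JUnit 3 style).
        TestKillSubProcesses test = new TestKillSubProcesses();
        test.setName("testJobKillFailAndSucceed");
        TestResult result = new TestResult();
        test.run(result);

        // Report the outcome; a clean run prints "failures: 0, errors: 0".
        System.out.println("failures: " + result.failureCount()
                + ", errors: " + result.errorCount());
    }
}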