List of usage examples for java.lang.Process#destroy()
/** Signature of {@code java.lang.Process#destroy()}, the method whose usages are collected in this listing. */
public abstract void destroy();
From source file:com.ah.be.common.NmsUtil.java
/**
 * Collects the local TCP/UDP port numbers reported by {@code netstat}.
 *
 * <p>Runs {@code netstat -ano} through {@code cmd.exe /C} when the {@code os.name} system
 * property contains "windows" (case-insensitive), otherwise {@code netstat -anp} through
 * {@code bash -c}. For every output line that starts with TCP or UDP (case-insensitive), the
 * first space-separated field containing a ':' is taken as the "Local Address" column and the
 * text after its last ':' as the port. A non-numeric port is logged and the line is skipped.
 *
 * <p>NOTE(review): only stdout is read; stderr of the child process is neither consumed nor
 * redirected. That is fine for the short diagnostics netstat normally prints, but confirm if
 * the command is ever changed.
 *
 * @return the distinct port numbers in ascending order; never {@code null}
 * @throws IOException if the command cannot be started or its output cannot be read
 */
public static List<Integer> checkUsingPorts() throws IOException {
    String cmdForWindows = "netstat -ano";
    String cmdForLinux = "netstat -anp";
    String os = System.getProperty("os.name");
    String[] cmdArray = os.toLowerCase().contains("windows")
            ? new String[] { "cmd.exe", "/C", cmdForWindows }
            : new String[] { "bash", "-c", cmdForLinux };
    // Built once; used both for the log line and for the error message below.
    String cmdText = cmdArray[0] + " " + cmdArray[1] + " " + cmdArray[2];

    Process proc = null;
    BufferedReader reader = null;
    log.info("checkUsingPorts", "Executing netstat cmd: " + cmdText);

    try {
        proc = Runtime.getRuntime().exec(cmdArray);
        InputStream in = proc.getInputStream();
        if (in == null) {
            throw new IOException("Could not get the input stream from Process when executing " + cmdText);
        }
        reader = new BufferedReader(new InputStreamReader(in));

        // A sorted set de-duplicates and orders in one pass, replacing the former
        // List.contains() check per port plus a final Collections.sort().
        java.util.Set<Integer> usingPorts = new java.util.TreeSet<Integer>();
        String rawLine;
        while ((rawLine = reader.readLine()) != null) {
            rawLine = rawLine.trim().toUpperCase();
            if (!rawLine.startsWith("TCP") && !rawLine.startsWith("UDP")) {
                continue;
            }
            for (StringTokenizer token = new StringTokenizer(rawLine, " "); token.hasMoreTokens();) {
                String field = token.nextToken();
                // The first token containing ":" is the "Local Address" field, and the
                // sub-string behind its last ":" is the port number being used.
                int lastColonIndex = field.lastIndexOf(":");
                if (lastColonIndex == -1) {
                    continue;
                }
                String strPort = field.substring(lastColonIndex + 1);
                try {
                    usingPorts.add(Integer.parseInt(strPort));
                } catch (NumberFormatException nfe) {
                    log.error("checkUsingPorts", strPort + " is not a numeric.", nfe);
                }
                // Only the local address is of interest; ignore the remaining columns.
                break;
            }
        }
        return new ArrayList<Integer>(usingPorts);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ioe) {
                log.error("checkUsingPorts", "I/O Error in closing BufferedReader", ioe);
            }
        }
        if (proc != null) {
            proc.destroy();
        }
    }
}
From source file:org.apache.nifi.processors.standard.ExecuteStreamCommand.java
@Override public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException { FlowFile inputFlowFile = session.get(); if (null == inputFlowFile) { return;/* ww w. ja va 2 s . c o m*/ } final ArrayList<String> args = new ArrayList<>(); final boolean putToAttribute = context.getProperty(PUT_OUTPUT_IN_ATTRIBUTE).isSet(); final Integer attributeSize = context.getProperty(PUT_ATTRIBUTE_MAX_LENGTH).asInteger(); final String attributeName = context.getProperty(PUT_OUTPUT_IN_ATTRIBUTE).getValue(); final String executeCommand = context.getProperty(EXECUTION_COMMAND) .evaluateAttributeExpressions(inputFlowFile).getValue(); args.add(executeCommand); final String commandArguments = context.getProperty(EXECUTION_ARGUMENTS) .evaluateAttributeExpressions(inputFlowFile).getValue(); final boolean ignoreStdin = Boolean.parseBoolean(context.getProperty(IGNORE_STDIN).getValue()); if (!StringUtils.isBlank(commandArguments)) { for (String arg : ArgumentUtils.splitArgs(commandArguments, context.getProperty(ARG_DELIMITER).getValue().charAt(0))) { args.add(arg); } } final String workingDir = context.getProperty(WORKING_DIR).evaluateAttributeExpressions(inputFlowFile) .getValue(); final ProcessBuilder builder = new ProcessBuilder(); logger.debug("Executing and waiting for command {} with arguments {}", new Object[] { executeCommand, commandArguments }); File dir = null; if (!StringUtils.isBlank(workingDir)) { dir = new File(workingDir); if (!dir.exists() && !dir.mkdirs()) { logger.warn("Failed to create working directory {}, using current working directory {}", new Object[] { workingDir, System.getProperty("user.dir") }); } } final Map<String, String> environment = new HashMap<>(); for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) { if (entry.getKey().isDynamic()) { environment.put(entry.getKey().getName(), entry.getValue()); } } builder.environment().putAll(environment); builder.command(args); 
builder.directory(dir); builder.redirectInput(Redirect.PIPE); builder.redirectOutput(Redirect.PIPE); final Process process; try { process = builder.start(); } catch (IOException e) { logger.error("Could not create external process to run command", e); throw new ProcessException(e); } try (final OutputStream pos = process.getOutputStream(); final InputStream pis = process.getInputStream(); final InputStream pes = process.getErrorStream(); final BufferedInputStream bis = new BufferedInputStream(pis); final BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(pes))) { int exitCode = -1; final BufferedOutputStream bos = new BufferedOutputStream(pos); FlowFile outputFlowFile = putToAttribute ? inputFlowFile : session.create(inputFlowFile); ProcessStreamWriterCallback callback = new ProcessStreamWriterCallback(ignoreStdin, bos, bis, logger, attributeName, session, outputFlowFile, process, putToAttribute, attributeSize); session.read(inputFlowFile, callback); outputFlowFile = callback.outputFlowFile; if (putToAttribute) { outputFlowFile = session.putAttribute(outputFlowFile, attributeName, new String(callback.outputBuffer, 0, callback.size)); } exitCode = callback.exitCode; logger.debug("Execution complete for command: {}. Exited with code: {}", new Object[] { executeCommand, exitCode }); Map<String, String> attributes = new HashMap<>(); final StringBuilder strBldr = new StringBuilder(); try { String line; while ((line = bufferedReader.readLine()) != null) { strBldr.append(line).append("\n"); } } catch (IOException e) { strBldr.append("Unknown...could not read Process's Std Error"); } int length = strBldr.length() > 4000 ? 4000 : strBldr.length(); attributes.put("execution.error", strBldr.substring(0, length)); final Relationship outputFlowFileRelationship = putToAttribute ? 
ORIGINAL_RELATIONSHIP : OUTPUT_STREAM_RELATIONSHIP; if (exitCode == 0) { logger.info("Transferring flow file {} to {}", new Object[] { outputFlowFile, outputFlowFileRelationship.getName() }); } else { logger.error("Transferring flow file {} to {}. Executable command {} ended in an error: {}", new Object[] { outputFlowFile, outputFlowFileRelationship.getName(), executeCommand, strBldr.toString() }); } attributes.put("execution.status", Integer.toString(exitCode)); attributes.put("execution.command", executeCommand); attributes.put("execution.command.args", commandArguments); outputFlowFile = session.putAllAttributes(outputFlowFile, attributes); // This transfer will transfer the FlowFile that received the stream out put to it's destined relationship. // In the event the stream is put to the an attribute of the original, it will be transferred here. session.transfer(outputFlowFile, outputFlowFileRelationship); if (!putToAttribute) { logger.info("Transferring flow file {} to original", new Object[] { inputFlowFile }); inputFlowFile = session.putAllAttributes(inputFlowFile, attributes); session.transfer(inputFlowFile, ORIGINAL_RELATIONSHIP); } } catch (final IOException ex) { // could not close Process related streams logger.warn("Problem terminating Process {}", new Object[] { process }, ex); } finally { process.destroy(); // last ditch effort to clean up that process. } }
From source file:org.apache.ambari.server.bootstrap.BSRunner.java
@Override public void run() { if (sshHostInfo.getSshKey() == null || sshHostInfo.getSshKey().equals("")) { beforeBootStrap(sshHostInfo);//from w ww .ja v a 2 s. co m } String hostString = createHostString(sshHostInfo.getHosts()); String user = sshHostInfo.getUser(); String userRunAs = sshHostInfo.getUserRunAs(); if (user == null || user.isEmpty()) { user = DEFAULT_USER; } String command[] = new String[12]; BSStat stat = BSStat.RUNNING; String scriptlog = ""; try { createRunDir(); if (LOG.isDebugEnabled()) { // FIXME needs to be removed later // security hole LOG.debug("Using ssh key=\"" + sshHostInfo.getSshKey() + "\""); } String password = sshHostInfo.getPassword(); if (password != null && !password.isEmpty()) { this.passwordFile = new File(this.requestIdDir, "host_pass"); // TODO : line separator should be changed // if we are going to support multi platform server-agent solution String lineSeparator = System.getProperty("line.separator"); password = password + lineSeparator; writePasswordFile(password); } writeSshKeyFile(sshHostInfo.getSshKey()); /* Running command: * script hostlist bsdir user sshkeyfile */ command[0] = this.bsScript; command[1] = hostString; command[2] = this.requestIdDir.toString(); command[3] = user; command[4] = this.sshKeyFile.toString(); command[5] = this.agentSetupScript.toString(); command[6] = this.ambariHostname; command[7] = this.clusterOsFamily; command[8] = this.projectVersion; command[9] = this.serverPort + ""; command[10] = userRunAs; command[11] = (this.passwordFile == null) ? 
"null" : this.passwordFile.toString(); LOG.info("Host= " + hostString + " bs=" + this.bsScript + " requestDir=" + requestIdDir + " user=" + user + " keyfile=" + this.sshKeyFile + " passwordFile " + this.passwordFile + " server=" + this.ambariHostname + " version=" + projectVersion + " serverPort=" + this.serverPort + " userRunAs=" + userRunAs); String[] env = new String[] { "AMBARI_PASSPHRASE=" + agentSetupPassword }; if (this.verbose) env = new String[] { env[0], " BS_VERBOSE=\"-vvv\" " }; if (LOG.isDebugEnabled()) { LOG.debug(Arrays.toString(command)); } String bootStrapOutputFilePath = requestIdDir + File.separator + "bootstrap.out"; String bootStrapErrorFilePath = requestIdDir + File.separator + "bootstrap.err"; Process process = Runtime.getRuntime().exec(command, env); PrintWriter stdOutWriter = null; PrintWriter stdErrWriter = null; try { stdOutWriter = new PrintWriter(bootStrapOutputFilePath); stdErrWriter = new PrintWriter(bootStrapErrorFilePath); IOUtils.copy(process.getInputStream(), stdOutWriter); IOUtils.copy(process.getErrorStream(), stdErrWriter); } finally { if (stdOutWriter != null) stdOutWriter.close(); if (stdErrWriter != null) stdErrWriter.close(); } // Startup a scheduled executor service to look through the logs ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); BSStatusCollector statusCollector = new BSStatusCollector(); ScheduledFuture<?> handle = scheduler.scheduleWithFixedDelay(statusCollector, 0, 10, TimeUnit.SECONDS); LOG.info("Kicking off the scheduler for polling on logs in " + this.requestIdDir); try { LOG.info("Bootstrap output, log=" + bootStrapErrorFilePath + " " + bootStrapOutputFilePath); int exitCode = process.waitFor(); String outMesg = ""; String errMesg = ""; try { outMesg = FileUtils.readFileToString(new File(bootStrapOutputFilePath)); errMesg = FileUtils.readFileToString(new File(bootStrapErrorFilePath)); } catch (IOException io) { LOG.info("Error in reading files ", io); } scriptlog = outMesg + "\n\n" 
+ errMesg; LOG.info("Script log Mesg " + scriptlog); if (exitCode != 0) { stat = BSStat.ERROR; } else { stat = BSStat.SUCCESS; } scheduler.schedule(new BSStatusCollector(), 0, TimeUnit.SECONDS); long startTime = System.currentTimeMillis(); while (true) { if (LOG.isDebugEnabled()) { LOG.debug("Waiting for hosts status to be updated"); } boolean pendingHosts = false; BootStrapStatus tmpStatus = bsImpl.getStatus(requestId); List<BSHostStatus> hostStatusList = tmpStatus.getHostsStatus(); if (hostStatusList != null) { for (BSHostStatus status : hostStatusList) { if (status.getStatus().equals("RUNNING")) { pendingHosts = true; } } } else { //Failed to get host status, waiting for hosts status to be updated pendingHosts = true; } if (LOG.isDebugEnabled()) { LOG.debug("Whether hosts status yet to be updated, pending=" + pendingHosts); } if (!pendingHosts) { break; } try { Thread.sleep(1000); } catch (InterruptedException e) { // continue } long now = System.currentTimeMillis(); if (now >= (startTime + 15000)) { LOG.warn("Gave up waiting for hosts status to be updated"); break; } } } catch (InterruptedException e) { throw new IOException(e); } finally { handle.cancel(true); /* schedule a last update */ scheduler.schedule(new BSStatusCollector(), 0, TimeUnit.SECONDS); scheduler.shutdownNow(); try { scheduler.awaitTermination(10, TimeUnit.SECONDS); } catch (InterruptedException e) { LOG.info("Interruped while waiting for scheduler"); } process.destroy(); } } catch (IOException io) { LOG.info("Error executing bootstrap " + io.getMessage()); stat = BSStat.ERROR; } finally { /* get the bstatus */ BootStrapStatus tmpStatus = bsImpl.getStatus(requestId); List<BSHostStatus> hostStatusList = tmpStatus.getHostsStatus(); if (hostStatusList != null) { for (BSHostStatus hostStatus : hostStatusList) { if ("FAILED".equals(hostStatus.getStatus())) { stat = BSStat.ERROR; break; } } } else { stat = BSStat.ERROR; } tmpStatus.setLog(scriptlog); tmpStatus.setStatus(stat); 
bsImpl.updateStatus(requestId, tmpStatus); bsImpl.reset(); // Remove private ssh key after bootstrap is complete try { FileUtils.forceDelete(sshKeyFile); } catch (IOException io) { LOG.warn(io.getMessage()); } if (passwordFile != null) { // Remove password file after bootstrap is complete try { FileUtils.forceDelete(passwordFile); } catch (IOException io) { LOG.warn(io.getMessage()); } } finished(); } }
From source file:org.apache.flink.test.recovery.AbstractProcessFailureRecoveryTest.java
@Test public void testTaskManagerProcessFailure() { final StringWriter processOutput1 = new StringWriter(); final StringWriter processOutput2 = new StringWriter(); final StringWriter processOutput3 = new StringWriter(); ActorSystem jmActorSystem = null;/*from w w w. j a v a 2 s.c o m*/ Process taskManagerProcess1 = null; Process taskManagerProcess2 = null; Process taskManagerProcess3 = null; File coordinateTempDir = null; try { // check that we run this test only if the java command // is available on this machine String javaCommand = getJavaCommandPath(); if (javaCommand == null) { System.out.println("---- Skipping Process Failure test : Could not find java executable ----"); return; } // create a logging file for the process File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties"); tempLogFile.deleteOnExit(); CommonTestUtils.printLog4jDebugConfig(tempLogFile); // coordination between the processes goes through a directory coordinateTempDir = createTempDirectory(); // find a free port to start the JobManager final int jobManagerPort = NetUtils.getAvailablePort(); // start a JobManager Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort); Configuration jmConfig = new Configuration(); jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms"); jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s"); jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9); jmConfig.setString(ConfigConstants.DEFAULT_EXECUTION_RETRY_DELAY_KEY, "10 s"); jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<Tuple2<String, Object>>(localAddress)); ActorRef jmActor = JobManager.startJobManagerActors(jmConfig, jmActorSystem, StreamingMode.STREAMING) ._1(); // the TaskManager java command String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", 
getCurrentClasspath(), TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) }; // start the first two TaskManager processes taskManagerProcess1 = new ProcessBuilder(command).start(); new PipeForwarder(taskManagerProcess1.getErrorStream(), processOutput1); taskManagerProcess2 = new ProcessBuilder(command).start(); new PipeForwarder(taskManagerProcess2.getErrorStream(), processOutput2); // we wait for the JobManager to have the two TaskManagers available // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes) waitUntilNumTaskManagersAreRegistered(jmActor, 2, 120000); // the program will set a marker file in each of its parallel tasks once they are ready, so that // this coordinating code is aware of this. // the program will very slowly consume elements until the marker file (later created by the // test driver code) is present final File coordinateDirClosure = coordinateTempDir; final Throwable[] errorRef = new Throwable[1]; // we trigger program execution in a separate thread Thread programTrigger = new Thread("Program Trigger") { @Override public void run() { try { testProgram(jobManagerPort, coordinateDirClosure); } catch (Throwable t) { t.printStackTrace(); errorRef[0] = t; } } }; //start the test program programTrigger.start(); // wait until all marker files are in place, indicating that all tasks have started // max 20 seconds waitForMarkerFiles(coordinateTempDir, PARALLELISM, 20000); // start the third TaskManager taskManagerProcess3 = new ProcessBuilder(command).start(); new PipeForwarder(taskManagerProcess3.getErrorStream(), processOutput3); // we wait for the third TaskManager to register // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes) waitUntilNumTaskManagersAreRegistered(jmActor, 3, 120000); // kill one of the previous TaskManagers, triggering a failure and recovery taskManagerProcess1.destroy(); taskManagerProcess1 = null; // we 
create the marker file which signals the program functions tasks that they can complete touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE)); // wait for at most 5 minutes for the program to complete programTrigger.join(300000); // check that the program really finished assertFalse("The program did not finish in time", programTrigger.isAlive()); // check whether the program encountered an error if (errorRef[0] != null) { Throwable error = errorRef[0]; error.printStackTrace(); fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage()); } // all seems well :-) } catch (Exception e) { e.printStackTrace(); printProcessLog("TaskManager 1", processOutput1.toString()); printProcessLog("TaskManager 2", processOutput2.toString()); printProcessLog("TaskManager 3", processOutput3.toString()); fail(e.getMessage()); } catch (Error e) { e.printStackTrace(); printProcessLog("TaskManager 1", processOutput1.toString()); printProcessLog("TaskManager 2", processOutput2.toString()); printProcessLog("TaskManager 3", processOutput3.toString()); throw e; } finally { if (taskManagerProcess1 != null) { taskManagerProcess1.destroy(); } if (taskManagerProcess2 != null) { taskManagerProcess2.destroy(); } if (taskManagerProcess3 != null) { taskManagerProcess3.destroy(); } if (jmActorSystem != null) { jmActorSystem.shutdown(); } if (coordinateTempDir != null) { try { FileUtils.deleteDirectory(coordinateTempDir); } catch (Throwable t) { // we can ignore this } } } }
From source file:com.twosigma.beaker.core.rest.PluginServiceLocatorRest.java
/** * locatePluginService//www .ja v a2 s .com * locate the service that matches the passed-in information about a service and return the * base URL the client can use to connect to the target plugin service. If such service * doesn't exist, this implementation will also start the service. * * @param pluginId * @param command name of the starting script * @param nginxRules rules to help setup nginx proxying * @param startedIndicator string indicating that the plugin has started * @param startedIndicatorStream stream to search for indicator, null defaults to stdout * @param recordOutput boolean, record out/err streams to output log service or not, null defaults * to false * @param waitfor if record output log service is used, string to wait for before logging starts * @return the base url of the service * @throws InterruptedException * @throws IOException */ @GET @Path("/{plugin-id}") @Produces(MediaType.TEXT_PLAIN) public Response locatePluginService(@PathParam("plugin-id") String pluginId, @QueryParam("command") String command, @QueryParam("nginxRules") @DefaultValue("rest") String nginxRules, @QueryParam("startedIndicator") String startedIndicator, @QueryParam("startedIndicatorStream") @DefaultValue("stdout") String startedIndicatorStream, @QueryParam("recordOutput") @DefaultValue("false") boolean recordOutput, @QueryParam("waitfor") String waitfor) throws InterruptedException, IOException, ExecutionException { PluginConfig pConfig = this.plugins.get(pluginId); if (pConfig != null && pConfig.isStarted()) { logger.info("plugin service " + pluginId + " already started at" + pConfig.getBaseUrl()); return buildResponse(pConfig.getBaseUrl(), false); } String password = RandomStringUtils.random(40, true, true); Process proc = null; String restartId = ""; /* * Only one plugin can be started at a given time since we need to find a free port. * We serialize starting of plugins and we parallelize nginx configuration reload with the actual plugin * evaluator start. 
*/ synchronized (this) { // find a port to use for proxypass between nginx and the plugin final int port = getNextAvailablePort(this.portSearchStart); final String baseUrl = generatePrefixedRandomString(pluginId, 12).replaceAll("[\\s]", ""); pConfig = new PluginConfig(port, nginxRules, baseUrl, password); this.portSearchStart = pConfig.port + 1; this.plugins.put(pluginId, pConfig); if (nginxRules.startsWith("ipython")) { generateIPythonConfig(pluginId, port, password, command); if (isIPython4OrNewer(getIPythonVersion(pluginId, command))) { new JupyterWidgetsExtensionProcessor(pluginId, this.pluginDir).copyJupyterExtensionIfExists(); } } // reload nginx config restartId = generateNginxConfig(); Process restartproc = Runtime.getRuntime().exec(this.nginxRestartCommand, this.nginxEnv); startGobblers(restartproc, "restart-nginx-" + pluginId, null, null); restartproc.waitFor(); ArrayList<String> fullCommand = new ArrayList<String>(Arrays.asList(command.split("\\s+"))); String args; fullCommand.set(0, (this.pluginLocations.containsKey(pluginId) ? this.pluginLocations.get(pluginId) : this.pluginDir) + "/" + fullCommand.get(0)); if (Files.notExists(Paths.get(fullCommand.get(0)))) { throw new PluginServiceNotFoundException("plugin service " + pluginId + " not found at " + command); } List<String> extraArgs = this.pluginArgs.get(pluginId); if (extraArgs != null) { fullCommand.addAll(extraArgs); } fullCommand.add(Integer.toString(pConfig.port)); String[] env = buildEnv(pluginId, password); if (windows()) { String python = this.config.getInstallDirectory() + "\\python\\python"; fullCommand.add(0, python); } logger.info("Running"); for (int i = 0; i < fullCommand.size(); i++) { logger.info(i + ": " + fullCommand.get(i)); } proc = Runtime.getRuntime().exec(listToArray(fullCommand), env); } if (startedIndicator != null && !startedIndicator.isEmpty()) { InputStream is = startedIndicatorStream.equals("stderr") ? 
proc.getErrorStream() : proc.getInputStream(); InputStreamReader ir = new InputStreamReader(is); BufferedReader br = new BufferedReader(ir); String line = ""; while ((line = br.readLine()) != null) { logger.info("looking on " + startedIndicatorStream + " found:" + line); if (line.indexOf(startedIndicator) >= 0) { logger.info("Acknowledge " + pluginId + " plugin started due to " + startedIndicator); break; } } if (null == line) { throw new PluginServiceNotFoundException("plugin service: " + pluginId + " failed to start"); } } startGobblers(proc, pluginId, recordOutput ? this.outputLogService : null, waitfor); // check that nginx did actually restart String url = "http://127.0.0.1:" + this.restartPort + "/restart." + restartId + "/present.html"; try { spinCheck(url); } catch (Throwable t) { logger.warn("time out plugin = {}", pluginId); this.plugins.remove(pluginId); if (windows()) { new WinProcess(proc).killRecursively(); } else { proc.destroy(); // send SIGTERM } throw new NginxRestartFailedException( "nginx restart failed.\n" + "url=" + url + "\n" + "message=" + t.getMessage()); } pConfig.setProcess(proc); logger.info("Done starting " + pluginId); return buildResponse(pConfig.getBaseUrl(), true); }
From source file:com.att.android.arodatacollector.main.AROCollectorService.java
/** * This method creates a SU enabled shell Sets the execute permission for * tcpdump and key.db Starts the tcpdump on Completion or abnormal * termination of tcpdump Shell is destroyed * /*from ww w. j ava 2 s .co m*/ * @throws IOException * @throws InterruptedException */ private void startTcpDump() throws IOException, InterruptedException { Log.d(TAG, "inside startTcpDump at timestamp " + System.currentTimeMillis()); Process sh = null; DataOutputStream os = null; int shExitValue = 0; try { startCalTime = Calendar.getInstance(); if (!AROCollectorUtils.isTcpDumpRunning()) { //only start tcpdump if it's not already running, to handle the case where the background //service was stopped and now restarting Log.i(TAG, "tcpdump is not running. Starting tcpdump in the shell now"); sh = Runtime.getRuntime().exec("su"); os = new DataOutputStream(sh.getOutputStream()); String Command = "chmod 777 " + ARODataCollector.INTERNAL_DATA_PATH + TCPDUMPFILENAME + "\n"; os.writeBytes(Command); Command = "chmod 777 " + ARODataCollector.INTERNAL_DATA_PATH + "key.db" + "\n"; os.writeBytes(Command); //flurry timed event duration mApp.writeToFlurryAndLogEvent(flurryTimedEvent, "Flurry trace start", startCalTime.getTime().toString(), "Trace Duration", true); /*Command = "." + ARODataCollector.INTERNAL_DATA_PATH + TCPDUMPFILENAME + " -w " + TRACE_FOLDERNAME + "\n";*/ Command = "." 
+ ARODataCollector.INTERNAL_DATA_PATH + TCPDUMPFILENAME + " -i any -w " + TRACE_FOLDERNAME + "\n"; os.writeBytes(Command); Command = "exit\n"; os.writeBytes(Command); os.flush(); StreamClearer stdoutClearer = new StreamClearer(sh.getInputStream(), "stdout", true); new Thread(stdoutClearer).start(); StreamClearer stderrClearer = new StreamClearer(sh.getErrorStream(), "stderr", true); new Thread(stderrClearer).start(); shExitValue = sh.waitFor(); if (DEBUG) { Log.i(TAG, "tcpdump waitFor returns exit value: " + shExitValue + " at " + System.currentTimeMillis()); } } else { Log.i(TAG, "timestamp " + System.currentTimeMillis() + ": tcpdump is already running"); } //We will continue and block the thread untill we see valid instance of tcpdump running in shell //waitFor() does not seems to be working on ICS firmware while (AROCollectorUtils.isTcpDumpRunning()) { continue; } if (DEBUG) { Log.d(TAG, "tcpdump process exit value: " + shExitValue); Log.i(TAG, "Coming out of startTcpDump at " + System.currentTimeMillis()); logTcpdumpPid(); } // Stopping the Video capture right after tcpdump coming out of // shell new Thread(new Runnable() { @Override public void run() { if (mVideoRecording && mApp.getAROVideoCaptureRunningFlag()) { stopScreenVideoCapture(); stopDmesg(); } } }).start(); final Calendar endCalTime = Calendar.getInstance(); FlurryAgent.endTimedEvent("Trace Duration"); mApp.writeToFlurry(flurryTimedEvent, "Flurry trace end", endCalTime.getTime().toString(), "flurryTimedEvent", AROCollectorUtils.NOT_APPLICABLE, AROCollectorUtils.EMPTY_STRING); mApp.writeToFlurry(flurryTimedEvent, "calculated Flurry trace duration", getUpTime(endCalTime), "flurryTimedEvent", AROCollectorUtils.NOT_APPLICABLE, AROCollectorUtils.EMPTY_STRING); logFlurryEvents(); DataCollectorTraceStop(); } finally { try { mApp.setTcpDumpStartFlag(false); if (os != null) { os.close(); } if (sh != null) { sh.destroy(); } } catch (Exception e) { Log.e(TAG, "exception in startTcpDump DataOutputStream close", 
e); } } }
From source file:com.ikanow.infinit.e.application.handlers.polls.LogstashTestRequestPollHandler.java
@Override public void performPoll() { if (null == LOGSTASH_DIRECTORY) { // (static memory not yet initialized) try {/*from ww w. j ava2 s.c o m*/ Thread.sleep(1000); // (extend the sleep time a bit) } catch (Exception e) { } return; } // 1] Check - does logstash exist on this server: File logstashBinary = new File(LOGSTASH_BINARY); if (!logstashBinary.canExecute()) { try { Thread.sleep(10000); // (extend the sleep time a bit) } catch (Exception e) { } return; } // 2] (Unlike harvester, _don't_ grab an application token, you can run this on as many servers as you want) // 3] Setup if (null == _logHarvesterQ) { _logHarvesterQ = new MongoQueue(DbManager.getIngest().getLogHarvesterQ().getDB().getName(), DbManager.getIngest().getLogHarvesterQ().getName()); } if (null == _testOutputTemplate) { try { File testOutputTemplate = new File(LOGSTASH_TEST_OUTPUT_TEMPLATE); InputStream inStream = null; try { inStream = new FileInputStream(testOutputTemplate); _testOutputTemplate = IOUtils.toString(inStream); } catch (Exception e) {// abandon ship! return; } finally { inStream.close(); } } catch (Exception e) {// abandon ship! //DEBUG //e.printStackTrace(); return; } } //TESTED // 4] Check if any new requests have been made: BasicDBObject queueQuery = new BasicDBObject("logstash", new BasicDBObject(DbManager.exists_, true)); DBObject nextElement = _logHarvesterQ.pop(queueQuery); while (nextElement != null) { //DEBUG //System.out.println("FOUND: " + nextElement.toString()); TestLogstashExtractorPojo testInfo = TestLogstashExtractorPojo.fromDb(nextElement, TestLogstashExtractorPojo.class); if ((null == testInfo.maxDocs) || (null == testInfo.logstash.config) || (null == testInfo.isAdmin) || (null == testInfo.sourceKey)) { TestLogstashExtractorPojo testErr = new TestLogstashExtractorPojo(); testErr._id = testInfo._id; testErr.error = "Internal Logic Error. 
Missing one of: maxDocs, isAdmin, sourceKey, logstash.config"; _logHarvesterQ.push(testErr.toDb()); return; } //TESTED // Validate/tranform the configuration: StringBuffer errMessage = new StringBuffer(); String logstashConfig = LogstashConfigUtils.validateLogstashInput(testInfo.sourceKey, testInfo.logstash.config, errMessage, testInfo.isAdmin); if (null == logstashConfig) { // Validation error... TestLogstashExtractorPojo testErr = new TestLogstashExtractorPojo(); testErr._id = testInfo._id; testErr.error = "Validation error: " + errMessage.toString(); _logHarvesterQ.push(testErr.toDb()); return; } //TESTED // Replacement for #LOGSTASH{host} - currently only replacement supported (+ #IKANOW{} in main code) try { logstashConfig = logstashConfig.replace("#LOGSTASH{host}", java.net.InetAddress.getLocalHost().getHostName()); } catch (Exception e) { logstashConfig = logstashConfig.replace("#LOGSTASH{host}", "localhost.localdomain"); } //TESTED String outputConf = _testOutputTemplate.replace("_XXX_COLLECTION_XXX_", testInfo._id.toString()); //TESTED String sinceDbPath = LOGSTASH_WD + ".sincedb_" + testInfo._id.toString(); String conf = logstashConfig.replace("_XXX_DOTSINCEDB_XXX_", sinceDbPath) + outputConf.replace("_XXX_SOURCEKEY_XXX_", testInfo.sourceKey); boolean allWorked = false; Process logstashProcess = null; try { // 1] Create the process ArrayList<String> args = new ArrayList<String>(4); args.addAll(Arrays.asList(LOGSTASH_BINARY, "-e", conf)); if (0 == testInfo.maxDocs) { args.add("-t"); // test mode, must faster } //TESTED if ((null != testInfo.logstash.testDebugOutput) && testInfo.logstash.testDebugOutput) { args.add("--debug"); } else { args.add("--verbose"); } ProcessBuilder logstashProcessBuilder = new ProcessBuilder(args); logstashProcessBuilder = logstashProcessBuilder.directory(new File(LOGSTASH_WD)) .redirectErrorStream(true); logstashProcessBuilder.environment().put("JAVA_OPTS", ""); //DEBUG //System.out.println("STARTING: " + 
ArrayUtils.toString(logstashProcessBuilder.command().toArray())); // 2] Kick off the process logstashProcess = logstashProcessBuilder.start(); StringWriter outputAndError = new StringWriter(); OutputCollector outAndErrorStream = new OutputCollector(logstashProcess.getInputStream(), new PrintWriter(outputAndError)); outAndErrorStream.start(); final int toWait_s = 240; boolean exited = false; // 3] Check the output collection for records int errorVal = 0; long priorCount = 0L; int priorLogCount = 0; int timeOfLastLoggingChange = 0; int timeOfLastDocCountChange = 0; String reasonForExit = ""; int inactivityTimeout_s = 10; // (default) if (null != testInfo.logstash.testInactivityTimeout_secs) { inactivityTimeout_s = testInfo.logstash.testInactivityTimeout_secs; } for (int i = 0; i < toWait_s; i += 5) { try { Thread.sleep(5000); } catch (Exception e) { } long count = DbManager.getCollection("ingest", testInfo._id.toString()).count(); // 3.1] Do we have all the records (or is the number staying static) //DEBUG //System.out.println("FOUND: " + count + " VS " + priorCount + " , " + priorPriorCount); // 3.1a] All done? if ((count >= testInfo.maxDocs) && (count > 0)) { allWorked = true; break; } //TESTED // 3.1b] If not, has anything changes? if (priorCount != count) { timeOfLastDocCountChange = i; } if (priorLogCount != outAndErrorStream.getLines()) { timeOfLastLoggingChange = i; } // 3.1c] Check for inactivity if ((timeOfLastDocCountChange > 0) && (i - timeOfLastDocCountChange) >= inactivityTimeout_s) { // Delay between events: treat as success allWorked = true; break; } //TESTED if ((0 == count) && outAndErrorStream.getPipelineStarted() && ((timeOfLastLoggingChange > 0) && (i - timeOfLastLoggingChange) >= inactivityTimeout_s)) { // Delay between log messages after pipeline started, no documents, treat as failure //DEBUG //System.out.println("LOG LINES! 
" + i + " NUM = " + outAndErrorStream.getLines()); errorVal = 1; reasonForExit = "No records received and logging inactive.\n"; break; } //TESTED // 3.2] Has the process exited unexpectedly? try { errorVal = logstashProcess.exitValue(); reasonForExit = "Logstash process exited with error: " + errorVal + ".\n"; exited = true; //DEBUG //System.out.println("GOT EXIT VALUE: " + errorVal); break; } //TESTED catch (Exception e) { } // that's OK we're just still going is all... priorCount = count; priorLogCount = outAndErrorStream.getLines(); } //(end loop while waiting for job to complete) // 4] If the process is still running then kill it if (!exited) { //DEBUG //System.out.println("EXITED WITHOUT FINISHING"); logstashProcess.destroy(); } //TESTED // 5] Things to do when the job is done: (worked or not) // Send a message to the harvester outAndErrorStream.join(); // (if we're here then must have closed the process, wait for it to die) TestLogstashExtractorPojo testErr = new TestLogstashExtractorPojo(); testErr._id = testInfo._id; if ((testInfo.maxDocs > 0) || (0 != errorVal)) { testErr.error = reasonForExit + outputAndError.toString(); // (note this is capped at well below the BSON limit in the thread below) } else { // maxDocs==0 (ie pre-publish test) AND no error returned testErr.error = null; } _logHarvesterQ.push(testErr.toDb()); //TESTED } catch (Exception e) { //DEBUG //e.printStackTrace(); TestLogstashExtractorPojo testErr = new TestLogstashExtractorPojo(); testErr._id = testInfo._id; testErr.error = "Internal Logic Error: " + e.getMessage(); _logHarvesterQ.push(testErr.toDb()); } //TOTEST finally { // If we created a sincedb path then remove it: try { new File(sinceDbPath).delete(); } catch (Exception e) { } // (don't care if it fails) if (!allWorked) { // (otherwise up to the harvester to remove these) try { DbManager.getCollection("ingest", testInfo._id.toString()).drop(); } catch (Exception e) { } // doesn't matter if this errors } try { // Really really want 
to make sure the process isn't running if (null != logstashProcess) { logstashProcess.destroy(); } } catch (Exception e) { } catch (Error ee) { } } //TESTED // (If we actually processed an element, then try again immediate) nextElement = _logHarvesterQ.pop(queueQuery); } }
From source file:org.apache.flink.test.recovery.AbstractTaskManagerProcessFailureRecoveryTest.java
@Test public void testTaskManagerProcessFailure() { final StringWriter processOutput1 = new StringWriter(); final StringWriter processOutput2 = new StringWriter(); final StringWriter processOutput3 = new StringWriter(); ActorSystem jmActorSystem = null;/*from w w w.ja v a 2 s.com*/ Process taskManagerProcess1 = null; Process taskManagerProcess2 = null; Process taskManagerProcess3 = null; File coordinateTempDir = null; try { // check that we run this test only if the java command // is available on this machine String javaCommand = getJavaCommandPath(); if (javaCommand == null) { System.out.println("---- Skipping Process Failure test : Could not find java executable ----"); return; } // create a logging file for the process File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties"); tempLogFile.deleteOnExit(); CommonTestUtils.printLog4jDebugConfig(tempLogFile); // coordination between the processes goes through a directory coordinateTempDir = CommonTestUtils.createTempDirectory(); // find a free port to start the JobManager final int jobManagerPort = NetUtils.getAvailablePort(); // start a JobManager Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort); Configuration jmConfig = new Configuration(); jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms"); jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s"); jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9); jmConfig.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "10 s"); jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s"); jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress)); ActorRef jmActor = JobManager .startJobManagerActors(jmConfig, jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class) ._1(); // the TaskManager java command String[] command = new String[] { javaCommand, 
"-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", getCurrentClasspath(), TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) }; // start the first two TaskManager processes taskManagerProcess1 = new ProcessBuilder(command).start(); new CommonTestUtils.PipeForwarder(taskManagerProcess1.getErrorStream(), processOutput1); taskManagerProcess2 = new ProcessBuilder(command).start(); new CommonTestUtils.PipeForwarder(taskManagerProcess2.getErrorStream(), processOutput2); // we wait for the JobManager to have the two TaskManagers available // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes) waitUntilNumTaskManagersAreRegistered(jmActor, 2, 120000); // the program will set a marker file in each of its parallel tasks once they are ready, so that // this coordinating code is aware of this. // the program will very slowly consume elements until the marker file (later created by the // test driver code) is present final File coordinateDirClosure = coordinateTempDir; final AtomicReference<Throwable> errorRef = new AtomicReference<>(); // we trigger program execution in a separate thread Thread programTrigger = new Thread("Program Trigger") { @Override public void run() { try { testTaskManagerFailure(jobManagerPort, coordinateDirClosure); } catch (Throwable t) { t.printStackTrace(); errorRef.set(t); } } }; //start the test program programTrigger.start(); // wait until all marker files are in place, indicating that all tasks have started // max 20 seconds if (!waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, 120000)) { // check if the program failed for some reason if (errorRef.get() != null) { Throwable error = errorRef.get(); error.printStackTrace(); fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage()); } else { // no error occurred, simply a timeout fail("The 
tasks were not started within time (" + 120000 + "msecs)"); } } // start the third TaskManager taskManagerProcess3 = new ProcessBuilder(command).start(); new CommonTestUtils.PipeForwarder(taskManagerProcess3.getErrorStream(), processOutput3); // we wait for the third TaskManager to register // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes) waitUntilNumTaskManagersAreRegistered(jmActor, 3, 120000); // kill one of the previous TaskManagers, triggering a failure and recovery taskManagerProcess1.destroy(); taskManagerProcess1 = null; // we create the marker file which signals the program functions tasks that they can complete touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE)); // wait for at most 5 minutes for the program to complete programTrigger.join(300000); // check that the program really finished assertFalse("The program did not finish in time", programTrigger.isAlive()); // check whether the program encountered an error if (errorRef.get() != null) { Throwable error = errorRef.get(); error.printStackTrace(); fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage()); } // all seems well :-) } catch (Exception e) { e.printStackTrace(); printProcessLog("TaskManager 1", processOutput1.toString()); printProcessLog("TaskManager 2", processOutput2.toString()); printProcessLog("TaskManager 3", processOutput3.toString()); fail(e.getMessage()); } catch (Error e) { e.printStackTrace(); printProcessLog("TaskManager 1", processOutput1.toString()); printProcessLog("TaskManager 2", processOutput2.toString()); printProcessLog("TaskManager 3", processOutput3.toString()); throw e; } finally { if (taskManagerProcess1 != null) { taskManagerProcess1.destroy(); } if (taskManagerProcess2 != null) { taskManagerProcess2.destroy(); } if (taskManagerProcess3 != null) { taskManagerProcess3.destroy(); } if (jmActorSystem != null) { jmActorSystem.shutdown(); } if (coordinateTempDir != null) 
{ try { FileUtils.deleteDirectory(coordinateTempDir); } catch (Throwable t) { // we can ignore this } } } }
From source file:org.kepler.ssh.LocalExec.java
public int executeCmd(String command, OutputStream streamOut, OutputStream streamErr, String thirdPartyTarget) throws ExecException { _commandArr[_commandCount] = command; Runtime rt = Runtime.getRuntime(); Process proc; // get the pwd/passphrase to the third party (and perform authentication // if not yet done) String pwd = SshSession.getPwdToThirdParty(thirdPartyTarget); try {// www .ja v a2 s.com proc = rt.exec(_commandArr); } catch (Exception ex) { //ex.printStackTrace(); throw new ExecException("Cannot execute cmd ** : " + _commandArr[_commandCount] + ex); } // System.out.println("%%% Process started"); // the streams from the process: stdout and stderr BufferedReader out_in = new BufferedReader(new InputStreamReader(proc.getInputStream())); // stdout BufferedReader err_in = new BufferedReader(new InputStreamReader(proc.getErrorStream())); // stderr // the streams towards the caller: stdout and stderr BufferedWriter out_out = new BufferedWriter(new OutputStreamWriter(streamOut)); BufferedWriter err_out = new BufferedWriter(new OutputStreamWriter(streamErr)); BufferedWriter proc_in = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream())); // stdin String line; // Temp for each line of output. 
int exitVal = -32766; boolean readOut = true; boolean readErr = true; boolean finished = false; boolean checkForPwd = (pwd != null); char c[] = new char[256]; int charsRead; // variables for the timeout checking long start = System.currentTimeMillis(); long current = 0; long maxtime = timeout * 1000L; while (!finished) { // will stop when the process terminates or after // timeout // check the status of the process try { exitVal = proc.exitValue(); finished = true; // process terminated so exit this loop after // reading the buffers } catch (IllegalThreadStateException ex) { // process not yet terminated so we go further } // read stdout if (readOut) { try { while (out_in.ready()) { charsRead = out_in.read(c, 0, 256); out_out.write(c, 0, charsRead); // System.out.println("%%% "+ new String(c, 0, // charsRead)); /* * try { proc_in.write("Anyadat\n", 0, 8); // send the * password proc_in.flush(); } catch (Exception ex) { * System.out.println("### "+ex); * * } */ if (checkForPwd && containsPasswordRequest(c, 0, charsRead)) { // System.out.println("%%% Found password request"); out_out.flush(); // so you may see the request on // stdout already proc_in.write(pwd + "\n", 0, pwd.length() + 1); // send // the // password proc_in.flush(); log.info("Sent password to third party."); checkForPwd = false; // even if it's wrong, do not // do it again } if (timeoutRestartOnStdout) start = System.currentTimeMillis(); // restart // timeout timer } } catch (IOException ioe) { log.error("<IOException> when reading the stdout: " + ioe + "</IOException>"); readOut = false; } } // read stderr if (readErr) { try { while (err_in.ready()) { charsRead = err_in.read(c, 0, 256); err_out.write(c, 0, charsRead); System.out.println("### " + new String(c, 0, charsRead)); if (checkForPwd && containsPasswordRequest(c, 0, charsRead)) { System.out.println("### Found password request"); out_out.flush(); // so you may see the request on // stdout already proc_in.write(pwd + "\n", 0, pwd.length() + 1); 
// send // the // password proc_in.flush(); log.info("Sent password to third party."); checkForPwd = false; // even if it's wrong, do not // do it again } if (timeoutRestartOnStderr) start = System.currentTimeMillis(); // restart // timeout timer } } catch (IOException ioe) { log.error("<IOException> when reading the stderr: " + ioe + "</IOException>"); readErr = false; } } // sleep a bit to not overload the system if (!finished) try { java.lang.Thread.sleep(100); } catch (InterruptedException ex) { } // check timeout current = System.currentTimeMillis(); if (timeout > 0 && maxtime < current - start) { log.error("Timeout: " + timeout + "s elapsed for command " + command); proc.destroy(); throw new ExecTimeoutException(command); // exitVal = timeoutErrorCode; // finished = true; } } try { // flush to caller out_out.flush(); err_out.flush(); // close streams from/to child process out_in.close(); err_in.close(); proc_in.close(); } catch (IOException ex) { log.error("Could not flush output streams: " + ex); } // System.out.println("ExitValue: " + exitVal); return exitVal; }
From source file:org.apache.flink.test.recovery.ProcessFailureBatchRecoveryITCase.java
@Test public void testTaskManagerProcessFailure() { final StringWriter processOutput1 = new StringWriter(); final StringWriter processOutput2 = new StringWriter(); final StringWriter processOutput3 = new StringWriter(); ActorSystem jmActorSystem = null;/*from www . j a v a 2 s . co m*/ Process taskManagerProcess1 = null; Process taskManagerProcess2 = null; Process taskManagerProcess3 = null; File coordinateTempDir = null; try { // check that we run this test only if the java command // is available on this machine String javaCommand = getJavaCommandPath(); if (javaCommand == null) { System.out.println( "---- Skipping ProcessFailureBatchRecoveryITCase : Could not find java executable"); return; } // create a logging file for the process File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties"); tempLogFile.deleteOnExit(); CommonTestUtils.printLog4jDebugConfig(tempLogFile); // coordination between the processes goes through a directory coordinateTempDir = createTempDirectory(); // find a free port to start the JobManager final int jobManagerPort = NetUtils.getAvailablePort(); // start a JobManager Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort); Configuration jmConfig = new Configuration(); jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "500 ms"); jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2 s"); jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 2); jmConfig.setString(ConfigConstants.DEFAULT_EXECUTION_RETRY_DELAY_KEY, "4 s"); jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<Tuple2<String, Object>>(localAddress)); ActorRef jmActor = JobManager.startJobManagerActors(jmConfig, jmActorSystem)._1(); // the TaskManager java command String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", getCurrentClasspath(), 
TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) }; // start the first two TaskManager processes taskManagerProcess1 = new ProcessBuilder(command).start(); new PipeForwarder(taskManagerProcess1.getErrorStream(), processOutput1); taskManagerProcess2 = new ProcessBuilder(command).start(); new PipeForwarder(taskManagerProcess2.getErrorStream(), processOutput2); // we wait for the JobManager to have the two TaskManagers available // wait for at most 20 seconds waitUntilNumTaskManagersAreRegistered(jmActor, 2, 20000); // the program will set a marker file in each of its parallel tasks once they are ready, so that // this coordinating code is aware of this. // the program will very slowly consume elements until the marker file (later created by the // test driver code) is present final File coordinateDirClosure = coordinateTempDir; final Throwable[] errorRef = new Throwable[1]; ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", jobManagerPort); env.setDegreeOfParallelism(PARALLELISM); env.setNumberOfExecutionRetries(1); final long NUM_ELEMENTS = 1000000L; final DataSet<Long> result = env.generateSequence(1, NUM_ELEMENTS) // make sure every mapper is involved (no one is skipped because of lazy split assignment) .rebalance() // the majority of the behavior is in the MapFunction .map(new RichMapFunction<Long, Long>() { private final File proceedFile = new File(coordinateDirClosure, PROCEED_MARKER_FILE); private boolean markerCreated = false; private boolean checkForProceedFile = true; @Override public Long map(Long value) throws Exception { if (!markerCreated) { int taskIndex = getRuntimeContext().getIndexOfThisSubtask(); touchFile(new File(coordinateDirClosure, READY_MARKER_FILE_PREFIX + taskIndex)); markerCreated = true; } // check if the proceed file exists if (checkForProceedFile) { if (proceedFile.exists()) { checkForProceedFile = false; } else { // otherwise wait so that we make slow progress 
Thread.sleep(10); } } return value; } }).reduce(new ReduceFunction<Long>() { @Override public Long reduce(Long value1, Long value2) { return value1 + value2; } }); // we trigger a program now (in a separate thread) Thread programTrigger = new Thread("ProcessFailureBatchRecoveryITCase Program Trigger") { @Override public void run() { try { long sum = result.collect().get(0); assertEquals(NUM_ELEMENTS * (NUM_ELEMENTS + 1L) / 2L, sum); } catch (Throwable t) { t.printStackTrace(); errorRef[0] = t; } } }; programTrigger.start(); // wait until all marker files are in place, indicating that all tasks have started // max 20 seconds waitForMarkerFiles(coordinateTempDir, PARALLELISM, 20000); // start the third TaskManager taskManagerProcess3 = new ProcessBuilder(command).start(); new PipeForwarder(taskManagerProcess3.getErrorStream(), processOutput3); // we wait for the third TaskManager to register (20 seconds max) waitUntilNumTaskManagersAreRegistered(jmActor, 3, 20000); // kill one of the previous TaskManagers, triggering a failure and recovery taskManagerProcess1.destroy(); taskManagerProcess1 = null; // we create the marker file which signals the program functions tasks that they can complete touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE)); // wait for at most 30 seconds for the program to complete programTrigger.join(30000); // check that the program really finished assertFalse("The program did not finish in time", programTrigger.isAlive()); // check whether the program encountered an error if (errorRef[0] != null) { Throwable error = errorRef[0]; error.printStackTrace(); fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage()); } // all seems well :-) } catch (Exception e) { e.printStackTrace(); printProcessLog("TaskManager 1", processOutput1.toString()); printProcessLog("TaskManager 2", processOutput2.toString()); printProcessLog("TaskManager 3", processOutput3.toString()); fail(e.getMessage()); } catch (Error e) 
{ e.printStackTrace(); printProcessLog("TaskManager 1", processOutput1.toString()); printProcessLog("TaskManager 2", processOutput2.toString()); printProcessLog("TaskManager 3", processOutput3.toString()); throw e; } finally { if (taskManagerProcess1 != null) { taskManagerProcess1.destroy(); } if (taskManagerProcess2 != null) { taskManagerProcess2.destroy(); } if (taskManagerProcess3 != null) { taskManagerProcess3.destroy(); } if (jmActorSystem != null) { jmActorSystem.shutdown(); } if (coordinateTempDir != null) { try { FileUtils.deleteDirectory(coordinateTempDir); } catch (Throwable t) { // we can ignore this } } } }