Example usage for org.apache.spark.launcher SparkLauncher SparkLauncher(Map<String, String> env)

Introduction

This page collects example usages of the org.apache.spark.launcher.SparkLauncher constructor that takes a map of environment variables.

Prototype

public SparkLauncher(Map<String, String> env) 

Document

Creates a launcher that will set the given environment variables in the child.
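For orientation before the project examples below, here is a minimal, self-contained sketch of this constructor in use. The Spark home, application jar, main class, and arguments are placeholders for illustration, not values taken from any of the quoted projects.

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.launcher.SparkAppHandle;
import org.apache.spark.launcher.SparkLauncher;

public class SparkLauncherEnvExample {

    public static void main(String[] args) throws Exception {
        // Entries in this map are set as environment variables of the spark-submit child process.
        Map<String, String> env = new HashMap<>();
        env.put("SPARK_PRINT_LAUNCH_COMMAND", "1");

        SparkLauncher launcher = new SparkLauncher(env)
                .setSparkHome("/opt/spark")            // placeholder Spark installation
                .setAppResource("/tmp/my-app.jar")     // placeholder application jar
                .setMainClass("com.example.MyApp")     // placeholder main class
                .setMaster("local[*]")
                .setConf(SparkLauncher.DRIVER_MEMORY, "1g")
                .addAppArgs("--input", "/tmp/data");   // placeholder application arguments

        // startApplication() launches spark-submit as a child process and returns a handle
        // that can be polled for the application id and state.
        SparkAppHandle handle = launcher.startApplication();
        while (!handle.getState().isFinal()) {
            Thread.sleep(1000);
        }
        System.out.println("Application " + handle.getAppId() + " finished in state " + handle.getState());
    }
}

The alternative launch() method, used in some of the examples below, starts the same child process but returns the raw java.lang.Process instead of a SparkAppHandle.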

Usage

From source file:com.ebay.logstream.runner.spark.SparkPipelineRunner.java

License:Apache License

@Override
public Map<String, Object> run(Pipeline pipeline) {
    Map<String, Object> result = new HashMap<>();
    Map<String, String> env = Maps.newHashMap();
    env.put("SPARK_PRINT_LAUNCH_COMMAND", "1");
    SparkLauncher launcher = new SparkLauncher(env);
    launcher.setAppResource(pipeline.getContext().getPipelineJarPath());
    launcher.setAppName(pipeline.getContext().getPipelineName());
    launcher.setMainClass(SparkPipelineRunner.class.getCanonicalName());
    launcher.setSparkHome(pipeline.getContext().getConfig().getString(SPARK_HOME_KEY));
    launcher.setJavaHome(pipeline.getContext().getConfig().getString(JAVA_HOME));
    //set app args
    launcher.addAppArgs(pipeline.getContext().getPipeline());
    launcher.addAppArgs(pipeline.getContext().getPipelineName());
    launcher.addAppArgs(pipeline.getContext().getDeployMode().toString());
    launcher.addAppArgs(pipeline.getContext().getInputParallelism() + "");
    launcher.addAppArgs(pipeline.getContext().getFilterParallelism() + "");
    launcher.addAppArgs(pipeline.getContext().getOutputParallelism() + "");
    //work around(for get driver pid)
    String uuid = UUID.randomUUID().toString();
    launcher.addAppArgs(uuid);
    launcher.setVerbose(true);
    launcher.addSparkArg("--verbose");
    if (pipeline.getContext().getDeployMode() == LogStormConstants.DeployMode.LOCAL) {
        launcher.setMaster("local[*]");
    } else {
        launcher.setMaster(pipeline.getContext().getConfig().getString(SPARK_MASTER_KEY));
    }

    try {
        SparkAppHandle handle = launcher.startApplication();
        while (handle.getAppId() == null) {
            Thread.sleep(1000);
        }
        result.put("applicationId", handle.getAppId());
        LOG.info("generate spark applicationId " + handle.getAppId());
        //get driver pid
        String cmd = "ps -ef | grep " + uuid + " | grep -v grep | awk '{print $2}'";
        LOG.info("cmd {}", cmd);
        Process process = Runtime.getRuntime().exec(new String[] { "/bin/sh", "-c", cmd });
        try {
            // wait for the shell command to finish before reading its output
            process.waitFor();
        } catch (Exception e) {
            LOG.warn("failed to wait for driver pid command: ", e);
        }
        InputStream inputStream = process.getInputStream();
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
        String pid;
        while ((pid = bufferedReader.readLine()) != null) {
            result.put("driverPid", pid);
            System.out.println(pid);
        }
        bufferedReader.close();
    } catch (Exception e) {
        LOG.error("failed to start as a spark application, ", e);
    }

    return result;
}

From source file:com.streamsets.datacollector.pipeline.executor.spark.yarn.YarnAppLauncher.java

License:Apache License

@VisibleForTesting
protected SparkLauncher getLauncher() {
    return new SparkLauncher(yarnConfigs.env);
}

From source file:com.thinkbiganalytics.nifi.v2.spark.ExecuteSparkJob.java

License:Apache License

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    final ComponentLog logger = getLog();
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    String PROVENANCE_JOB_STATUS_KEY = "Job Status";
    String PROVENANCE_SPARK_EXIT_CODE_KEY = "Spark Exit Code";

    try {

        PROVENANCE_JOB_STATUS_KEY = context.getName() + " Job Status";
        PROVENANCE_SPARK_EXIT_CODE_KEY = context.getName() + " Spark Exit Code";

        /* Configuration parameters for spark launcher */
        String appJar = getApplicationJar(context, flowFile);
        String mainClass = getMainClass(context, flowFile);
        String[] appArgs = getMainArgs(context, flowFile);
        String extraJars = getExtraJars(context, flowFile);
        String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile).getValue();
        String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile).getValue()
                .trim();
        String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE)
                .evaluateAttributeExpressions(flowFile).getValue();
        String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile)
                .getValue();
        String executorMemory = context.getProperty(EXECUTOR_MEMORY).evaluateAttributeExpressions(flowFile)
                .getValue();
        String numberOfExecutors = context.getProperty(NUMBER_EXECUTORS).evaluateAttributeExpressions(flowFile)
                .getValue();
        String sparkApplicationName = context.getProperty(SPARK_APPLICATION_NAME)
                .evaluateAttributeExpressions(flowFile).getValue();
        String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile)
                .getValue();
        String networkTimeout = context.getProperty(NETWORK_TIMEOUT).evaluateAttributeExpressions(flowFile)
                .getValue();
        String principal = context.getProperty(kerberosPrincipal).getValue();
        String keyTab = context.getProperty(kerberosKeyTab).getValue();
        String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
        String sparkConfs = context.getProperty(SPARK_CONFS).evaluateAttributeExpressions(flowFile).getValue();
        String extraFiles = context.getProperty(EXTRA_SPARK_FILES).evaluateAttributeExpressions(flowFile)
                .getValue();
        Integer sparkProcessTimeout = context.getProperty(PROCESS_TIMEOUT)
                .evaluateAttributeExpressions(flowFile).asTimePeriod(TimeUnit.SECONDS).intValue();
        String datasourceIds = context.getProperty(DATASOURCES).evaluateAttributeExpressions(flowFile)
                .getValue();
        String catalogDataSourceIds = context.getProperty(CATALOG_DATASOURCES)
                .evaluateAttributeExpressions(flowFile).getValue();
        String dataSetIds = context.getProperty(DATASETS).evaluateAttributeExpressions(flowFile).getValue();
        MetadataProviderService metadataService = context.getProperty(METADATA_SERVICE)
                .asControllerService(MetadataProviderService.class);

        final List<String> extraJarPaths = getExtraJarPaths(extraJars);

        // If all 3 fields are filled out then assume kerberos is enabled, and user should be authenticated
        boolean isAuthenticated = !StringUtils.isEmpty(principal) && !StringUtils.isEmpty(keyTab)
                && !StringUtils.isEmpty(hadoopConfigurationResources);
        try {
            if (isAuthenticated && isSecurityEnabled(hadoopConfigurationResources)) {
                logger.info("Security is enabled");

                if (principal.equals("") && keyTab.equals("")) {
                    logger.error(
                            "Kerberos Principal and Kerberos KeyTab information missing in Kerboeros enabled cluster. {} ",
                            new Object[] { flowFile });
                    session.transfer(flowFile, REL_FAILURE);
                    return;
                }

                logger.info("User authentication initiated");

                boolean authenticationStatus = new ApplySecurityPolicy().validateUserWithKerberos(logger,
                        hadoopConfigurationResources, principal, keyTab);
                if (authenticationStatus) {
                    logger.info("User authenticated successfully.");
                } else {
                    logger.error("User authentication failed.  {} ", new Object[] { flowFile });
                    session.transfer(flowFile, REL_FAILURE);
                    return;
                }
            }
        } catch (IOException e1) {
            logger.error("Unknown exception occurred while authenticating user : {} and flow file: {}",
                    new Object[] { e1.getMessage(), flowFile });
            session.transfer(flowFile, REL_FAILURE);
            return;

        } catch (Exception unknownException) {
            logger.error("Unknown exception occurred while validating user : {}.  {} ",
                    new Object[] { unknownException.getMessage(), flowFile });
            session.transfer(flowFile, REL_FAILURE);
            return;
        }

        String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile).getValue();

        // Build environment
        final Map<String, String> env = getDatasources(session, flowFile, PROVENANCE_JOB_STATUS_KEY,
                datasourceIds, dataSetIds, catalogDataSourceIds, metadataService, extraJarPaths);
        if (env != null) {
            StringBuilder datasourceSummary = new StringBuilder();

            if (env.containsKey("DATASETS")) {
                final int count = StringUtils.countMatches("DATASETS", ',') + 1;
                datasourceSummary.append(count).append(" datasets");
            }
            if (env.containsKey("DATASOURCES")) {
                final int count = StringUtils.countMatches("DATASOURCES", ',') + 1;
                (datasourceSummary.length() > 0 ? datasourceSummary.append("; ") : datasourceSummary)
                        .append(count).append(" legacy datasources");
            }
            if (env.containsKey("CATALOG_DATASOURCES")) {
                final int count = StringUtils.countMatches("CATALOG_DATASOURCES", ',') + 1;
                (datasourceSummary.length() > 0 ? datasourceSummary.append("; ") : datasourceSummary)
                        .append(count).append(" catalog datasources");
            }

            String summaryString = datasourceSummary.toString();
            if (StringUtils.isNotBlank(summaryString)) {
                flowFile = session.putAttribute(flowFile, "Data source usage", summaryString);
            }
        } else {
            return;
        }

        addEncryptionSettings(env);

        /* Launch the spark job as a child process */
        SparkLauncher launcher = new SparkLauncher(env).setAppResource(appJar).setMainClass(mainClass)
                .setMaster(sparkMaster).setConf(SparkLauncher.DRIVER_MEMORY, driverMemory)
                .setConf(SPARK_NUM_EXECUTORS, numberOfExecutors)
                .setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory)
                .setConf(SparkLauncher.EXECUTOR_CORES, executorCores)
                .setConf(SPARK_NETWORK_TIMEOUT_CONFIG_NAME, networkTimeout).setSparkHome(sparkHome)
                .setAppName(sparkApplicationName);

        OptionalSparkConfigurator optionalSparkConf = new OptionalSparkConfigurator(launcher)
                .setDeployMode(sparkMaster, sparkYarnDeployMode)
                .setAuthentication(isAuthenticated, keyTab, principal).addAppArgs(appArgs)
                .addSparkArg(sparkConfs).addExtraJars(extraJarPaths).setYarnQueue(yarnQueue)
                .setExtraFiles(extraFiles);

        Process spark = optionalSparkConf.getLaucnher().launch();

        /* Read/clear the process input stream */
        InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO,
                logger, spark.getInputStream());
        Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
        inputThread.start();

        /* Read/clear the process error stream */
        InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO,
                logger, spark.getErrorStream());
        Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
        errorThread.start();

        logger.info("Waiting for Spark job to complete");

        /* Wait for job completion */
        boolean completed = spark.waitFor(sparkProcessTimeout, TimeUnit.SECONDS);
        if (!completed) {
            spark.destroyForcibly();
            getLog().error("Spark process timed out after {} seconds using flow file: {}  ",
                    new Object[] { sparkProcessTimeout, flowFile });
            session.transfer(flowFile, REL_FAILURE);
            return;
        }

        int exitCode = spark.exitValue();

        flowFile = session.putAttribute(flowFile, PROVENANCE_SPARK_EXIT_CODE_KEY, Integer.toString(exitCode));
        if (exitCode != 0) {
            logger.error("ExecuteSparkJob for {} and flowfile: {} completed with failed status {} ",
                    new Object[] { context.getName(), flowFile, exitCode });
            flowFile = session.putAttribute(flowFile, PROVENANCE_JOB_STATUS_KEY, "Failed");
            session.transfer(flowFile, REL_FAILURE);
        } else {
            logger.info("ExecuteSparkJob for {} and flowfile: {} completed with success status {} ",
                    new Object[] { context.getName(), flowFile, exitCode });
            flowFile = session.putAttribute(flowFile, PROVENANCE_JOB_STATUS_KEY, "Success");
            session.transfer(flowFile, REL_SUCCESS);
        }
    } catch (final Exception e) {
        logger.error("Unable to execute Spark job {},{}", new Object[] { flowFile, e.getMessage() }, e);
        flowFile = session.putAttribute(flowFile, PROVENANCE_JOB_STATUS_KEY, "Failed With Exception");
        flowFile = session.putAttribute(flowFile, "Spark Exception:", e.getMessage());
        session.transfer(flowFile, REL_FAILURE);
    }
}

From source file:com.thinkbiganalytics.spark.shell.SparkShellProcessBuilder.java

License:Apache License

/**
 * Constructs a {@code SparkShellProcessBuilder}.
 */
public SparkShellProcessBuilder() {
    // Generate client id and secret
    clientId = UUID.randomUUID().toString();
    clientSecret = UUID.randomUUID().toString();

    // Create Spark Launcher
    final Map<String, String> env = ImmutableMap.<String, String>builder().put(CLIENT_ID, clientId)
            .put(CLIENT_SECRET, clientSecret).build();
    launcher = new SparkLauncher(env).setConf("spark.driver.userClassPathFirst", "true")
            .setConf("spark.yarn.appMasterEnv." + CLIENT_ID, clientId)
            .setConf("spark.yarn.appMasterEnv." + CLIENT_SECRET, clientSecret)
            .setMainClass("com.thinkbiganalytics.spark.SparkShellApp");
}

From source file:org.datacleaner.spark.ApplicationDriver.java

License:Open Source License

public SparkLauncher createSparkLauncher(File hadoopConfDir, String configurationHdfsPath, String jobHdfsPath)
        throws Exception {
    // mimic env. variables
    final Map<String, String> env = new HashMap<>();
    env.put("YARN_CONF_DIR", hadoopConfDir.getAbsolutePath());

    final SparkLauncher sparkLauncher = new SparkLauncher(env);

    sparkLauncher.setSparkHome(_sparkHome);
    sparkLauncher.setMaster("yarn-cluster");
    sparkLauncher.setAppName("DataCleaner");

    final MutableRef<String> primaryJar = new MutableRef<>();
    final List<String> jars = buildJarFiles(primaryJar);
    logger.info("Using JAR files: {}", jars);

    for (final String jar : jars) {
        sparkLauncher.addJar(jar);
    }
    sparkLauncher.setMainClass(Main.class.getName());

    // the primary jar is always the first argument
    sparkLauncher.addAppArgs(primaryJar.get());

    sparkLauncher.addAppArgs(toHdfsPath(configurationHdfsPath));
    sparkLauncher.addAppArgs(toHdfsPath(jobHdfsPath));

    return sparkLauncher;
}