Example usage for org.apache.spark.launcher SparkLauncher launch

Introduction

On this page you can find example usages of the org.apache.spark.launcher.SparkLauncher#launch() method.

Prototype

public Process launch() throws IOException 

Document

Launches a sub-process that will start the configured Spark application.
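
Before the project-specific examples below, here is a minimal sketch of the typical launch() flow: configure a SparkLauncher, call launch() to obtain a java.lang.Process, and wait for it to exit. The Spark home, application jar, main class, and master shown here are hypothetical placeholders, not values taken from the examples on this page.

import java.io.IOException;

import org.apache.spark.launcher.SparkLauncher;

public class LaunchExample {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Hypothetical paths and class names; substitute values for your environment.
        Process spark = new SparkLauncher()
                .setSparkHome("/opt/spark")
                .setAppResource("/path/to/my-app.jar")
                .setMainClass("com.example.MyApp")
                .setMaster("local[*]")
                .setConf(SparkLauncher.DRIVER_MEMORY, "1g")
                .launch();

        // launch() leaves the child's stdout/stderr to the caller, so a real application
        // should consume them (see the NiFi example below); here we only wait for the exit code.
        int exitCode = spark.waitFor();
        System.out.println("Spark application finished with exit code " + exitCode);
    }
}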

Usage

From source file: com.cloudera.livy.client.local.ContextLauncher.java

License: Apache License

private static ChildProcess startDriver(final RpcServer rpcServer, final LocalConf conf, final String clientId,
        final String secret, final String className) throws IOException {
    final String serverAddress = rpcServer.getAddress();
    final String serverPort = String.valueOf(rpcServer.getPort());
    if (conf.get(CLIENT_IN_PROCESS) != null) {
        // Mostly for testing things quickly. Do not do this in production.
        LOG.warn("!!!! Running remote driver in-process. !!!!");
        Runnable child = new Runnable() {
            @Override
            public void run() {
                List<String> args = new ArrayList<>();
                args.add("--remote-host");
                args.add(serverAddress);
                args.add("--remote-port");
                args.add(serverPort);
                args.add("--client-id");
                args.add(clientId);
                args.add("--secret");
                args.add(secret);

                for (Map.Entry<String, String> e : conf) {
                    args.add("--conf");
                    args.add(String.format("%s=%s", e.getKey(), e.getValue()));
                }
                try {
                    RemoteDriver.main(args.toArray(new String[args.size()]));
                } catch (Exception e) {
                    LOG.error("Error running driver.", e);
                }
            }
        };
        return new ChildProcess(conf, child);
    } else {
        // If a Spark installation is provided, use the spark-submit script. Otherwise, call the
        // SparkSubmit class directly, which has some caveats (like having to provide a proper
        // version of Guava on the classpath depending on the deploy mode).
        final SparkLauncher launcher = new SparkLauncher();
        String sparkHome = conf.get(SPARK_HOME_KEY);
        if (sparkHome == null) {
            sparkHome = System.getenv(SPARK_HOME_ENV);
        }
        if (sparkHome == null) {
            sparkHome = System.getProperty(SPARK_HOME_KEY);
        }
        launcher.setSparkHome(sparkHome);

        conf.set(CLIENT_ID, clientId);
        conf.set(CLIENT_SECRET, secret);

        launcher.setAppResource("spark-internal");

        String livyJars = conf.get(LIVY_JARS);
        if (livyJars == null) {
            String livyHome = System.getenv("LIVY_HOME");
            Preconditions.checkState(livyHome != null, "Need one of LIVY_HOME or %s set.", LIVY_JARS.key());
            File clientJars = new File(livyHome, "client-jars");
            Preconditions.checkState(clientJars.isDirectory(),
                    "Cannot find 'client-jars' directory under LIVY_HOME.");
            List<String> jars = new ArrayList<>();
            for (File f : clientJars.listFiles()) {
                jars.add(f.getAbsolutePath());
            }
            livyJars = Joiner.on(",").join(jars);
        }

        String userJars = conf.get(SPARK_JARS_KEY);
        if (userJars != null) {
            String allJars = Joiner.on(",").join(livyJars, userJars);
            conf.set(SPARK_JARS_KEY, allJars);
        } else {
            conf.set(SPARK_JARS_KEY, livyJars);
        }

        // Disable multiple attempts since the RPC server doesn't yet support multiple
        // connections for the same registered app.
        conf.set("spark.yarn.maxAppAttempts", "1");

        File confFile = writeConfToFile(conf);

        // Define how to pass options to the child process. If launching in client (or local)
        // mode, the driver options need to be passed directly on the command line. Otherwise,
        // SparkSubmit will take care of that for us.
        String master = conf.get("spark.master");
        Preconditions.checkArgument(master != null, "spark.master is not defined.");
        launcher.setMaster(master);
        launcher.setPropertiesFile(confFile.getAbsolutePath());
        launcher.setMainClass(className);
        if (conf.get(PROXY_USER) != null) {
            launcher.addSparkArg("--proxy-user", conf.get(PROXY_USER));
        }
        launcher.addAppArgs("--remote-host", serverAddress);
        launcher.addAppArgs("--remote-port", serverPort);
        return new ChildProcess(conf, launcher.launch());
    }
}
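
The example above uses both addSparkArg() and addAppArgs(): the former injects an option for spark-submit itself (here --proxy-user), while the latter appends arguments that are handed to the launched application's main class. A small sketch of the distinction, with hypothetical values:

import org.apache.spark.launcher.SparkLauncher;

public class ArgKindsExample {
    public static void main(String[] args) throws Exception {
        SparkLauncher launcher = new SparkLauncher()
                .setAppResource("spark-internal")          // same marker resource as the Livy example
                .setMainClass("com.example.RemoteDriver")  // hypothetical driver class
                .setMaster("local[*]");

        // Passed to spark-submit itself:
        launcher.addSparkArg("--verbose");

        // Passed to the application's main class as its args:
        launcher.addAppArgs("--remote-host", "example-host", "--remote-port", "10000");

        int exitCode = launcher.launch().waitFor();
        System.out.println("Driver exited with code " + exitCode);
    }
}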

From source file: com.cloudera.livy.rsc.ContextLauncher.java

License: Apache License

private static ChildProcess startDriver(final RSCConf conf, Promise<?> promise) throws IOException {
    String livyJars = conf.get(LIVY_JARS);
    if (livyJars == null) {
        String livyHome = System.getenv("LIVY_HOME");
        Utils.checkState(livyHome != null, "Need one of LIVY_HOME or %s set.", LIVY_JARS.key());
        File rscJars = new File(livyHome, "rsc-jars");
        if (!rscJars.isDirectory()) {
            rscJars = new File(livyHome, "rsc/target/jars");
        }
        Utils.checkState(rscJars.isDirectory(), "Cannot find 'rsc-jars' directory under LIVY_HOME.");
        List<String> jars = new ArrayList<>();
        for (File f : rscJars.listFiles()) {
            jars.add(f.getAbsolutePath());
        }
        livyJars = Utils.join(jars, ",");
    }
    merge(conf, SPARK_JARS_KEY, livyJars, ",");

    String kind = conf.get(SESSION_KIND);
    if ("sparkr".equals(kind)) {
        merge(conf, SPARK_ARCHIVES_KEY, conf.get(RSCConf.Entry.SPARKR_PACKAGE), ",");
    } else if ("pyspark".equals(kind)) {
        merge(conf, "spark.submit.pyFiles", conf.get(RSCConf.Entry.PYSPARK_ARCHIVES), ",");
    }

    // Disable multiple attempts since the RPC server doesn't yet support multiple
    // connections for the same registered app.
    conf.set("spark.yarn.maxAppAttempts", "1");

    // Let the launcher go away when launching in yarn cluster mode. This avoids keeping lots
    // of "small" Java processes lingering on the Livy server node.
    conf.set("spark.yarn.submit.waitAppCompletion", "false");

    // For testing; propagate jacoco settings so that we also do coverage analysis
    // on the launched driver. We replace the name of the main file ("main.exec")
    // so that we don't end up fighting with the main test launcher.
    String jacocoArgs = System.getProperty("jacoco.args");
    if (jacocoArgs != null) {
        jacocoArgs = jacocoArgs.replace("main.exec", "child.exec");
        merge(conf, SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, jacocoArgs, " ");
    }

    final File confFile = writeConfToFile(conf);

    if (conf.getBoolean(CLIENT_IN_PROCESS)) {
        // Mostly for testing things quickly. Do not do this in production.
        LOG.warn("!!!! Running remote driver in-process. !!!!");
        Runnable child = new Runnable() {
            @Override
            public void run() {
                try {
                    RSCDriverBootstrapper.main(new String[] { confFile.getAbsolutePath() });
                } catch (Exception e) {
                    throw Utils.propagate(e);
                }
            }
        };
        return new ChildProcess(conf, promise, child, confFile);
    } else {
        final SparkLauncher launcher = new SparkLauncher();
        launcher.setSparkHome(System.getenv(SPARK_HOME_ENV));
        launcher.setAppResource("spark-internal");
        launcher.setPropertiesFile(confFile.getAbsolutePath());
        launcher.setMainClass(RSCDriverBootstrapper.class.getName());

        if (conf.get(PROXY_USER) != null) {
            launcher.addSparkArg("--proxy-user", conf.get(PROXY_USER));
        }

        return new ChildProcess(conf, promise, launcher.launch(), confFile);
    }
}
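
Both Livy variants wrap the returned Process in a ChildProcess helper that manages it. When no such helper exists, the child's stdout and stderr should be drained on background threads before waiting, since the OS pipe buffers can otherwise fill up and stall the child; the NiFi example further down does exactly this with its InputStreamReaderRunnable. A minimal sketch of that pattern using only the JDK:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

final class StreamDrainer implements Runnable {
    private final InputStream in;
    private final String prefix;

    StreamDrainer(InputStream in, String prefix) {
        this.in = in;
        this.prefix = prefix;
    }

    @Override
    public void run() {
        // Read the child's output line by line so its pipe never fills up.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(prefix + line);
            }
        } catch (IOException e) {
            // Stream closes when the child exits; nothing more to do.
        }
    }
}

// Usage sketch: drain both streams, then wait.
//   Process child = launcher.launch();
//   new Thread(new StreamDrainer(child.getInputStream(), "[stdout] ")).start();
//   new Thread(new StreamDrainer(child.getErrorStream(), "[stderr] ")).start();
//   int exitCode = child.waitFor();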

From source file: com.thinkbiganalytics.nifi.pyspark.core.ExecutePySpark.java

License: Apache License

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final ComponentLog logger = getLog();
    FlowFile flowFile = session.get();

    if (flowFile == null) {
        flowFile = session.create();
        logger.info("Created a flow file having uuid: {}",
                new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
    } else {
        logger.info("Using an existing flow file having uuid: {}",
                new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
    }
    try {
        final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
        final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
        final String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES)
                .getValue();
        final String pySparkAppFile = context.getProperty(PYSPARK_APP_FILE)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String pySparkAppArgs = context.getProperty(PYSPARK_APP_ARGS)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String pySparkAppName = context.getProperty(PYSPARK_APP_NAME)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String pySparkAdditionalFiles = context.getProperty(PYSPARK_ADDITIONAL_FILES)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile)
                .getValue().trim().toLowerCase();
        final String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String executorMemory = context.getProperty(EXECUTOR_MEMORY)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String executorInstances = context.getProperty(EXECUTOR_INSTANCES)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String networkTimeout = context.getProperty(NETWORK_TIMEOUT)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String additionalSparkConfigOptions = context.getProperty(ADDITIONAL_SPARK_CONFIG_OPTIONS)
                .evaluateAttributeExpressions(flowFile).getValue();

        PySparkUtils pySparkUtils = new PySparkUtils();

        /* Get app arguments */
        String[] pySparkAppArgsArray = null;
        if (!StringUtils.isEmpty(pySparkAppArgs)) {
            pySparkAppArgsArray = pySparkUtils.getCsvValuesAsArray(pySparkAppArgs);
            logger.info("Provided application arguments: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
        }

        /* Get additional python files */
        String[] pySparkAdditionalFilesArray = null;
        if (!StringUtils.isEmpty(pySparkAdditionalFiles)) {
            pySparkAdditionalFilesArray = pySparkUtils.getCsvValuesAsArray(pySparkAdditionalFiles);
            logger.info("Provided python files: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAdditionalFilesArray) });
        }

        /* Get additional config key-value pairs */
        String[] additionalSparkConfigOptionsArray = null;
        if (!StringUtils.isEmpty(additionalSparkConfigOptions)) {
            additionalSparkConfigOptionsArray = pySparkUtils.getCsvValuesAsArray(additionalSparkConfigOptions);
            logger.info("Provided spark config options: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(additionalSparkConfigOptionsArray) });
        }

        /* Determine if Kerberos is enabled */
        boolean kerberosEnabled = false;
        if (!StringUtils.isEmpty(kerberosPrincipal) && !StringUtils.isEmpty(kerberosKeyTab)
                && !StringUtils.isEmpty(hadoopConfigurationResources)) {
            kerberosEnabled = true;
            logger.info("Kerberos is enabled");
        }

        /* For Kerberized cluster, attempt user authentication */
        if (kerberosEnabled) {
            logger.info("Attempting user authentication for Kerberos");
            ApplySecurityPolicy applySecurityObject = new ApplySecurityPolicy();
            Configuration configuration;
            try {
                logger.info("Getting Hadoop configuration from " + hadoopConfigurationResources);
                configuration = ApplySecurityPolicy.getConfigurationFromResources(hadoopConfigurationResources);

                if (SecurityUtil.isSecurityEnabled(configuration)) {
                    logger.info("Security is enabled");

                    if (kerberosPrincipal.equals("") && kerberosKeyTab.equals("")) {
                        logger.error(
                                "Kerberos Principal and Keytab provided with empty values for a Kerberized cluster.");
                        session.transfer(flowFile, REL_FAILURE);
                        return;
                    }

                    try {
                        logger.info("User authentication initiated");

                        boolean authenticationStatus = applySecurityObject.validateUserWithKerberos(logger,
                                hadoopConfigurationResources, kerberosPrincipal, kerberosKeyTab);
                        if (authenticationStatus) {
                            logger.info("User authenticated successfully.");
                        } else {
                            logger.error("User authentication failed.");
                            session.transfer(flowFile, REL_FAILURE);
                            return;
                        }

                    } catch (Exception unknownException) {
                        logger.error("Unknown exception occurred while validating user :"
                                + unknownException.getMessage());
                        session.transfer(flowFile, REL_FAILURE);
                        return;
                    }
                }
            } catch (IOException e1) {
                logger.error("Unknown exception occurred while authenticating user :" + e1.getMessage());
                session.transfer(flowFile, REL_FAILURE);
                return;
            }
        }

        /* Build and launch PySpark Job */
        logger.info("Configuring PySpark job for execution");
        SparkLauncher pySparkLauncher = new SparkLauncher().setAppResource(pySparkAppFile);
        logger.info("PySpark app file set to: {}", new Object[] { pySparkAppFile });

        if (pySparkAppArgsArray != null && pySparkAppArgsArray.length > 0) {
            pySparkLauncher = pySparkLauncher.addAppArgs(pySparkAppArgsArray);
            logger.info("App arguments set to: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
        }

        pySparkLauncher = pySparkLauncher.setAppName(pySparkAppName).setMaster(sparkMaster);

        logger.info("App name set to: {}", new Object[] { pySparkAppName });
        logger.info("Spark master set to: {}", new Object[] { sparkMaster });

        if (pySparkAdditionalFilesArray != null && pySparkAdditionalFilesArray.length > 0) {
            for (String pySparkAdditionalFile : pySparkAdditionalFilesArray) {
                pySparkLauncher = pySparkLauncher.addPyFile(pySparkAdditionalFile);
                logger.info("Additional python file set to: {}", new Object[] { pySparkAdditionalFile });
            }
        }

        if (sparkMaster.equals("yarn")) {
            pySparkLauncher = pySparkLauncher.setDeployMode(sparkYarnDeployMode);
            logger.info("YARN deploy mode set to: {}", new Object[] { sparkYarnDeployMode });
        }

        pySparkLauncher = pySparkLauncher.setSparkHome(sparkHome)
                .setConf(SparkLauncher.DRIVER_MEMORY, driverMemory)
                .setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory)
                .setConf(CONFIG_PROP_SPARK_EXECUTOR_INSTANCES, executorInstances)
                .setConf(SparkLauncher.EXECUTOR_CORES, executorCores)
                .setConf(CONFIG_PROP_SPARK_NETWORK_TIMEOUT, networkTimeout);

        logger.info("Spark home set to: {} ", new Object[] { sparkHome });
        logger.info("Driver memory set to: {} ", new Object[] { driverMemory });
        logger.info("Executor memory set to: {} ", new Object[] { executorMemory });
        logger.info("Executor instances set to: {} ", new Object[] { executorInstances });
        logger.info("Executor cores set to: {} ", new Object[] { executorCores });
        logger.info("Network timeout set to: {} ", new Object[] { networkTimeout });

        if (kerberosEnabled) {
            pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_PRINCIPAL, kerberosPrincipal);
            pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_KEYTAB, kerberosKeyTab);
            logger.info("Kerberos principal set to: {} ", new Object[] { kerberosPrincipal });
            logger.info("Kerberos keytab set to: {} ", new Object[] { kerberosKeyTab });
        }

        if (!StringUtils.isEmpty(yarnQueue)) {
            pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_QUEUE, yarnQueue);
            logger.info("YARN queue set to: {} ", new Object[] { yarnQueue });
        }

        if (additionalSparkConfigOptionsArray != null && additionalSparkConfigOptionsArray.length > 0) {
            for (String additionalSparkConfigOption : additionalSparkConfigOptionsArray) {
                String[] confKeyValue = additionalSparkConfigOption.split("=");
                if (confKeyValue.length == 2) {
                    pySparkLauncher = pySparkLauncher.setConf(confKeyValue[0], confKeyValue[1]);
                    logger.info("Spark additional config option set to: {}={}",
                            new Object[] { confKeyValue[0], confKeyValue[1] });
                }
            }
        }

        logger.info("Starting execution of PySpark job");
        Process pySparkProcess = pySparkLauncher.launch();

        InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO,
                logger, pySparkProcess.getInputStream());
        Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
        inputThread.start();

        InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO,
                logger, pySparkProcess.getErrorStream());
        Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
        errorThread.start();

        logger.info("Waiting for PySpark job to complete");

        int exitCode = pySparkProcess.waitFor();
        if (exitCode != 0) {
            logger.info("Finished execution of PySpark job [FAILURE] [Status code: {}]",
                    new Object[] { exitCode });
            session.transfer(flowFile, REL_FAILURE);
        } else {
            logger.info("Finished execution of PySpark job [SUCCESS] [Status code: {}]",
                    new Object[] { exitCode });
            session.transfer(flowFile, REL_SUCCESS);
        }
    } catch (final Exception e) {
        logger.error("Unable to execute PySpark job [FAILURE]", new Object[] { flowFile, e });
        session.transfer(flowFile, REL_FAILURE);
    }
}

From source file: com.uber.hoodie.cli.commands.CommitsCommand.java

License: Apache License

@CliCommand(value = "commit rollback", help = "Rollback a commit")
public String rollbackCommit(
        @CliOption(key = { "commit" }, help = "Commit to rollback") final String commitTime,
        @CliOption(key = {
                "sparkProperties" }, help = "Spark Properties File Path") final String sparkPropertiesPath)
        throws Exception {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);

    if (!timeline.containsInstant(commitInstant)) {
        return "Commit " + commitTime + " not found in Commits " + timeline;
    }

    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime,
            HoodieCLI.tableMetadata.getBasePath());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    // Refresh the current view of commits
    refreshCommits();
    if (exitCode != 0) {
        return "Commit " + commitTime + " failed to roll back";
    }
    return "Commit " + commitTime + " rolled back";
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
public String scheduleCompact(
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory") final String sparkMemory)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    // First get a compaction instant time and pass it to spark launcher for scheduling compaction
    String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime();

    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
        SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
        sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(),
                HoodieCLI.tableMetadata.getBasePath(), HoodieCLI.tableMetadata.getTableConfig().getTableName(),
                compactionInstantTime, sparkMemory);
        Process process = sparkLauncher.launch();
        InputStreamConsumer.captureOutput(process);
        int exitCode = process.waitFor();
        if (exitCode != 0) {
            return "Failed to run compaction for " + compactionInstantTime;
        }
        return "Compaction successfully completed for " + compactionInstantTime;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
public String compact(@CliOption(key = {
        "parallelism" }, mandatory = true, help = "Parallelism for hoodie compaction") final String parallelism,
        @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String schemaFilePath,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory,
        @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
        @CliOption(key = "compactionInstant", mandatory = true, help = "Base path for the target hoodie dataset") final String compactionInstantTime)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
        SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
        sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
                HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism,
                schemaFilePath, sparkMemory, retry);
        Process process = sparkLauncher.launch();
        InputStreamConsumer.captureOutput(process);
        int exitCode = process.waitFor();
        if (exitCode != 0) {
            return "Failed to run compaction for " + compactionInstantTime;
        }
        return "Compaction successfully completed for " + compactionInstantTime;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction validate", help = "Validate Compaction")
public String validateCompaction(
        @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
        @CliOption(key = {
                "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = null;
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism,
                    master, sparkMemory);
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to validate compaction for " + compactionInstant;
            }
            List<ValidationOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            boolean valid = res.stream().map(r -> r.isSuccess()).reduce(Boolean::logicalAnd).orElse(true);
            String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
            List<Comparable[]> rows = new ArrayList<>();
            res.stream().forEach(r -> {
                Comparable[] row = new Comparable[] { r.getOperation().getFileId(),
                        r.getOperation().getBaseInstantTime(),
                        r.getOperation().getDataFilePath().isPresent()
                                ? r.getOperation().getDataFilePath().get()
                                : "",
                        r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
                        r.getException().isPresent() ? r.getException().get().getMessage() : "" };
                rows.add(row);
            });

            Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
            TableHeader header = new TableHeader().addTableHeaderField("File Id")
                    .addTableHeaderField("Base Instant Time").addTableHeaderField("Base Data File")
                    .addTableHeaderField("Num Delta Files").addTableHeaderField("Valid")
                    .addTableHeaderField("Error");

            output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending,
                    limit, headerOnly, rows);
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction")
public String unscheduleCompaction(
        @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
        @CliOption(key = {
                "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = {
                "skipValidation" }, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
        @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = "";
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism,
                    master, sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to unschedule compaction for " + compactionInstant;
            }
            List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
                    "unschedule pending compaction");
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId")
public String unscheduleCompactFile(
        @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = {
                "skipValidation" }, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
        @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = "";
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master, sparkMemory,
                    Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to unschedule compaction for file " + fileId;
            }
            List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
                    "unschedule file from pending compaction");
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as "
        + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.")
public String repairCompaction(
        @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
        @CliOption(key = {
                "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);
    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = "";
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism,
                    master, sparkMemory, Boolean.valueOf(dryRun).toString());
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to unschedule compaction for " + compactionInstant;
            }
            List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
                    "repair compaction");
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}