Example usage for org.apache.spark.launcher SparkLauncher launch

Introduction

On this page you can find example usages of the org.apache.spark.launcher.SparkLauncher#launch() method.

Prototype

public Process launch() throws IOException 

Document

Launches a sub-process that will start the configured Spark application.
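
Before the project-specific examples below, here is a minimal sketch of the typical launch() flow: configure a SparkLauncher, call launch() to obtain a java.lang.Process, and wait for it to exit. The Spark home, application jar, main class, and master shown here are hypothetical placeholders, not values taken from the examples on this page.

import java.io.IOException;

import org.apache.spark.launcher.SparkLauncher;

public class LaunchExample {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Hypothetical paths and class names; substitute values for your environment.
        Process spark = new SparkLauncher()
                .setSparkHome("/opt/spark")
                .setAppResource("/path/to/my-app.jar")
                .setMainClass("com.example.MyApp")
                .setMaster("local[*]")
                .setConf(SparkLauncher.DRIVER_MEMORY, "1g")
                .launch();

        // launch() leaves the child's stdout/stderr to the caller, so a real application
        // should consume them (see the NiFi example below); here we only wait for the exit code.
        int exitCode = spark.waitFor();
        System.out.println("Spark application finished with exit code " + exitCode);
    }
}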

Usage

From source file: com.cloudera.livy.client.local.ContextLauncher.java

License: Apache License

private static ChildProcess startDriver(final RpcServer rpcServer, final LocalConf conf, final String clientId,
        final String secret, final String className) throws IOException {
    final String serverAddress = rpcServer.getAddress();
    final String serverPort = String.valueOf(rpcServer.getPort());
    if (conf.get(CLIENT_IN_PROCESS) != null) {
        // Mostly for testing things quickly. Do not do this in production.
        LOG.warn("!!!! Running remote driver in-process. !!!!");
        Runnable child = new Runnable() {
            @Override
            public void run() {
                List<String> args = new ArrayList<>();
                args.add("--remote-host");
                args.add(serverAddress);
                args.add("--remote-port");
                args.add(serverPort);
                args.add("--client-id");
                args.add(clientId);
                args.add("--secret");
                args.add(secret);

                for (Map.Entry<String, String> e : conf) {
                    args.add("--conf");
                    args.add(String.format("%s=%s", e.getKey(), e.getValue()));
                }
                try {
                    RemoteDriver.main(args.toArray(new String[args.size()]));
                } catch (Exception e) {
                    LOG.error("Error running driver.", e);
                }
            }
        };
        return new ChildProcess(conf, child);
    } else {
        // If a Spark installation is provided, use the spark-submit script. Otherwise, call the
        // SparkSubmit class directly, which has some caveats (like having to provide a proper
        // version of Guava on the classpath depending on the deploy mode).
        final SparkLauncher launcher = new SparkLauncher();
        String sparkHome = conf.get(SPARK_HOME_KEY);
        if (sparkHome == null) {
            sparkHome = System.getenv(SPARK_HOME_ENV);
        }
        if (sparkHome == null) {
            sparkHome = System.getProperty(SPARK_HOME_KEY);
        }
        launcher.setSparkHome(sparkHome);

        conf.set(CLIENT_ID, clientId);
        conf.set(CLIENT_SECRET, secret);

        launcher.setAppResource("spark-internal");

        String livyJars = conf.get(LIVY_JARS);
        if (livyJars == null) {
            String livyHome = System.getenv("LIVY_HOME");
            Preconditions.checkState(livyHome != null, "Need one of LIVY_HOME or %s set.", LIVY_JARS.key());
            File clientJars = new File(livyHome, "client-jars");
            Preconditions.checkState(clientJars.isDirectory(),
                    "Cannot find 'client-jars' directory under LIVY_HOME.");
            List<String> jars = new ArrayList<>();
            for (File f : clientJars.listFiles()) {
                jars.add(f.getAbsolutePath());
            }
            livyJars = Joiner.on(",").join(jars);
        }

        String userJars = conf.get(SPARK_JARS_KEY);
        if (userJars != null) {
            String allJars = Joiner.on(",").join(livyJars, userJars);
            conf.set(SPARK_JARS_KEY, allJars);
        } else {
            conf.set(SPARK_JARS_KEY, livyJars);
        }

        // Disable multiple attempts since the RPC server doesn't yet support multiple
        // connections for the same registered app.
        conf.set("spark.yarn.maxAppAttempts", "1");

        File confFile = writeConfToFile(conf);

        // Define how to pass options to the child process. If launching in client (or local)
        // mode, the driver options need to be passed directly on the command line. Otherwise,
        // SparkSubmit will take care of that for us.
        String master = conf.get("spark.master");
        Preconditions.checkArgument(master != null, "spark.master is not defined.");
        launcher.setMaster(master);
        launcher.setPropertiesFile(confFile.getAbsolutePath());
        launcher.setMainClass(className);
        if (conf.get(PROXY_USER) != null) {
            launcher.addSparkArg("--proxy-user", conf.get(PROXY_USER));
        }
        launcher.addAppArgs("--remote-host", serverAddress);
        launcher.addAppArgs("--remote-port", serverPort);
        return new ChildProcess(conf, launcher.launch());
    }
}
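
The example above uses both addSparkArg() and addAppArgs(): the former injects an option for spark-submit itself (here --proxy-user), while the latter appends arguments that are handed to the launched application's main class. A small sketch of the distinction, with hypothetical values:

import org.apache.spark.launcher.SparkLauncher;

public class ArgKindsExample {
    public static void main(String[] args) throws Exception {
        SparkLauncher launcher = new SparkLauncher()
                .setAppResource("spark-internal")          // same marker resource as the Livy example
                .setMainClass("com.example.RemoteDriver")  // hypothetical driver class
                .setMaster("local[*]");

        // Passed to spark-submit itself:
        launcher.addSparkArg("--verbose");

        // Passed to the application's main class as its args:
        launcher.addAppArgs("--remote-host", "example-host", "--remote-port", "10000");

        int exitCode = launcher.launch().waitFor();
        System.out.println("Driver exited with code " + exitCode);
    }
}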

From source file: com.cloudera.livy.rsc.ContextLauncher.java

License: Apache License

private static ChildProcess startDriver(final RSCConf conf, Promise<?> promise) throws IOException {
    String livyJars = conf.get(LIVY_JARS);
    if (livyJars == null) {
        String livyHome = System.getenv("LIVY_HOME");
        Utils.checkState(livyHome != null, "Need one of LIVY_HOME or %s set.", LIVY_JARS.key());
        File rscJars = new File(livyHome, "rsc-jars");
        if (!rscJars.isDirectory()) {
            rscJars = new File(livyHome, "rsc/target/jars");
        }
        Utils.checkState(rscJars.isDirectory(), "Cannot find 'rsc-jars' directory under LIVY_HOME.");
        List<String> jars = new ArrayList<>();
        for (File f : rscJars.listFiles()) {
            jars.add(f.getAbsolutePath());
        }
        livyJars = Utils.join(jars, ",");
    }
    merge(conf, SPARK_JARS_KEY, livyJars, ",");

    String kind = conf.get(SESSION_KIND);
    if ("sparkr".equals(kind)) {
        merge(conf, SPARK_ARCHIVES_KEY, conf.get(RSCConf.Entry.SPARKR_PACKAGE), ",");
    } else if ("pyspark".equals(kind)) {
        merge(conf, "spark.submit.pyFiles", conf.get(RSCConf.Entry.PYSPARK_ARCHIVES), ",");
    }

    // Disable multiple attempts since the RPC server doesn't yet support multiple
    // connections for the same registered app.
    conf.set("spark.yarn.maxAppAttempts", "1");

    // Let the launcher go away when launching in yarn cluster mode. This avoids keeping lots
    // of "small" Java processes lingering on the Livy server node.
    conf.set("spark.yarn.submit.waitAppCompletion", "false");

    // For testing; propagate jacoco settings so that we also do coverage analysis
    // on the launched driver. We replace the name of the main file ("main.exec")
    // so that we don't end up fighting with the main test launcher.
    String jacocoArgs = System.getProperty("jacoco.args");
    if (jacocoArgs != null) {
        jacocoArgs = jacocoArgs.replace("main.exec", "child.exec");
        merge(conf, SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, jacocoArgs, " ");
    }

    final File confFile = writeConfToFile(conf);

    if (conf.getBoolean(CLIENT_IN_PROCESS)) {
        // Mostly for testing things quickly. Do not do this in production.
        LOG.warn("!!!! Running remote driver in-process. !!!!");
        Runnable child = new Runnable() {
            @Override
            public void run() {
                try {
                    RSCDriverBootstrapper.main(new String[] { confFile.getAbsolutePath() });
                } catch (Exception e) {
                    throw Utils.propagate(e);
                }
            }
        };
        return new ChildProcess(conf, promise, child, confFile);
    } else {
        final SparkLauncher launcher = new SparkLauncher();
        launcher.setSparkHome(System.getenv(SPARK_HOME_ENV));
        launcher.setAppResource("spark-internal");
        launcher.setPropertiesFile(confFile.getAbsolutePath());
        launcher.setMainClass(RSCDriverBootstrapper.class.getName());

        if (conf.get(PROXY_USER) != null) {
            launcher.addSparkArg("--proxy-user", conf.get(PROXY_USER));
        }

        return new ChildProcess(conf, promise, launcher.launch(), confFile);
    }
}
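
Both Livy variants wrap the returned Process in a ChildProcess helper that manages it. When no such helper exists, the child's stdout and stderr should be drained on background threads before waiting, since the OS pipe buffers can otherwise fill up and stall the child; the NiFi example further down does exactly this with its InputStreamReaderRunnable. A minimal sketch of that pattern using only the JDK:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

final class StreamDrainer implements Runnable {
    private final InputStream in;
    private final String prefix;

    StreamDrainer(InputStream in, String prefix) {
        this.in = in;
        this.prefix = prefix;
    }

    @Override
    public void run() {
        // Read the child's output line by line so its pipe never fills up.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(prefix + line);
            }
        } catch (IOException e) {
            // Stream closes when the child exits; nothing more to do.
        }
    }
}

// Usage sketch: drain both streams, then wait.
//   Process child = launcher.launch();
//   new Thread(new StreamDrainer(child.getInputStream(), "[stdout] ")).start();
//   new Thread(new StreamDrainer(child.getErrorStream(), "[stderr] ")).start();
//   int exitCode = child.waitFor();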

From source file: com.thinkbiganalytics.nifi.pyspark.core.ExecutePySpark.java

License: Apache License

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final ComponentLog logger = getLog();
    FlowFile flowFile = session.get();

    if (flowFile == null) {
        flowFile = session.create();
        logger.info("Created a flow file having uuid: {}",
                new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
    } else {
        logger.info("Using an existing flow file having uuid: {}",
                new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
    }
    try {
        final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
        final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
        final String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES)
                .getValue();
        final String pySparkAppFile = context.getProperty(PYSPARK_APP_FILE)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String pySparkAppArgs = context.getProperty(PYSPARK_APP_ARGS)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String pySparkAppName = context.getProperty(PYSPARK_APP_NAME)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String pySparkAdditionalFiles = context.getProperty(PYSPARK_ADDITIONAL_FILES)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile)
                .getValue().trim().toLowerCase();
        final String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String executorMemory = context.getProperty(EXECUTOR_MEMORY)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String executorInstances = context.getProperty(EXECUTOR_INSTANCES)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile)
                .getValue();
        final String networkTimeout = context.getProperty(NETWORK_TIMEOUT)
                .evaluateAttributeExpressions(flowFile).getValue();
        final String additionalSparkConfigOptions = context.getProperty(ADDITIONAL_SPARK_CONFIG_OPTIONS)
                .evaluateAttributeExpressions(flowFile).getValue();

        PySparkUtils pySparkUtils = new PySparkUtils();

        /* Get app arguments */
        String[] pySparkAppArgsArray = null;
        if (!StringUtils.isEmpty(pySparkAppArgs)) {
            pySparkAppArgsArray = pySparkUtils.getCsvValuesAsArray(pySparkAppArgs);
            logger.info("Provided application arguments: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
        }

        /* Get additional python files */
        String[] pySparkAdditionalFilesArray = null;
        if (!StringUtils.isEmpty(pySparkAdditionalFiles)) {
            pySparkAdditionalFilesArray = pySparkUtils.getCsvValuesAsArray(pySparkAdditionalFiles);
            logger.info("Provided python files: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAdditionalFilesArray) });
        }

        /* Get additional config key-value pairs */
        String[] additionalSparkConfigOptionsArray = null;
        if (!StringUtils.isEmpty(additionalSparkConfigOptions)) {
            additionalSparkConfigOptionsArray = pySparkUtils.getCsvValuesAsArray(additionalSparkConfigOptions);
            logger.info("Provided spark config options: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(additionalSparkConfigOptionsArray) });
        }

        /* Determine if Kerberos is enabled */
        boolean kerberosEnabled = false;
        if (!StringUtils.isEmpty(kerberosPrincipal) && !StringUtils.isEmpty(kerberosKeyTab)
                && !StringUtils.isEmpty(hadoopConfigurationResources)) {
            kerberosEnabled = true;
            logger.info("Kerberos is enabled");
        }

        /* For Kerberized cluster, attempt user authentication */
        if (kerberosEnabled) {
            logger.info("Attempting user authentication for Kerberos");
            ApplySecurityPolicy applySecurityObject = new ApplySecurityPolicy();
            Configuration configuration;
            try {
                logger.info("Getting Hadoop configuration from " + hadoopConfigurationResources);
                configuration = ApplySecurityPolicy.getConfigurationFromResources(hadoopConfigurationResources);

                if (SecurityUtil.isSecurityEnabled(configuration)) {
                    logger.info("Security is enabled");

                    if (kerberosPrincipal.equals("") && kerberosKeyTab.equals("")) {
                        logger.error(
                                "Kerberos Principal and Keytab provided with empty values for a Kerberized cluster.");
                        session.transfer(flowFile, REL_FAILURE);
                        return;
                    }

                    try {
                        logger.info("User authentication initiated");

                        boolean authenticationStatus = applySecurityObject.validateUserWithKerberos(logger,
                                hadoopConfigurationResources, kerberosPrincipal, kerberosKeyTab);
                        if (authenticationStatus) {
                            logger.info("User authenticated successfully.");
                        } else {
                            logger.error("User authentication failed.");
                            session.transfer(flowFile, REL_FAILURE);
                            return;
                        }

                    } catch (Exception unknownException) {
                        logger.error("Unknown exception occurred while validating user :"
                                + unknownException.getMessage());
                        session.transfer(flowFile, REL_FAILURE);
                        return;
                    }
                }
            } catch (IOException e1) {
                logger.error("Unknown exception occurred while authenticating user :" + e1.getMessage());
                session.transfer(flowFile, REL_FAILURE);
                return;
            }
        }

        /* Build and launch PySpark Job */
        logger.info("Configuring PySpark job for execution");
        SparkLauncher pySparkLauncher = new SparkLauncher().setAppResource(pySparkAppFile);
        logger.info("PySpark app file set to: {}", new Object[] { pySparkAppFile });

        if (pySparkAppArgsArray != null && pySparkAppArgsArray.length > 0) {
            pySparkLauncher = pySparkLauncher.addAppArgs(pySparkAppArgsArray);
            logger.info("App arguments set to: {}",
                    new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
        }

        pySparkLauncher = pySparkLauncher.setAppName(pySparkAppName).setMaster(sparkMaster);

        logger.info("App name set to: {}", new Object[] { pySparkAppName });
        logger.info("Spark master set to: {}", new Object[] { sparkMaster });

        if (pySparkAdditionalFilesArray != null && pySparkAdditionalFilesArray.length > 0) {
            for (String pySparkAdditionalFile : pySparkAdditionalFilesArray) {
                pySparkLauncher = pySparkLauncher.addPyFile(pySparkAdditionalFile);
                logger.info("Additional python file set to: {}", new Object[] { pySparkAdditionalFile });
            }
        }

        if (sparkMaster.equals("yarn")) {
            pySparkLauncher = pySparkLauncher.setDeployMode(sparkYarnDeployMode);
            logger.info("YARN deploy mode set to: {}", new Object[] { sparkYarnDeployMode });
        }

        pySparkLauncher = pySparkLauncher.setSparkHome(sparkHome)
                .setConf(SparkLauncher.DRIVER_MEMORY, driverMemory)
                .setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory)
                .setConf(CONFIG_PROP_SPARK_EXECUTOR_INSTANCES, executorInstances)
                .setConf(SparkLauncher.EXECUTOR_CORES, executorCores)
                .setConf(CONFIG_PROP_SPARK_NETWORK_TIMEOUT, networkTimeout);

        logger.info("Spark home set to: {} ", new Object[] { sparkHome });
        logger.info("Driver memory set to: {} ", new Object[] { driverMemory });
        logger.info("Executor memory set to: {} ", new Object[] { executorMemory });
        logger.info("Executor instances set to: {} ", new Object[] { executorInstances });
        logger.info("Executor cores set to: {} ", new Object[] { executorCores });
        logger.info("Network timeout set to: {} ", new Object[] { networkTimeout });

        if (kerberosEnabled) {
            pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_PRINCIPAL, kerberosPrincipal);
            pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_KEYTAB, kerberosKeyTab);
            logger.info("Kerberos principal set to: {} ", new Object[] { kerberosPrincipal });
            logger.info("Kerberos keytab set to: {} ", new Object[] { kerberosKeyTab });
        }

        if (!StringUtils.isEmpty(yarnQueue)) {
            pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_QUEUE, yarnQueue);
            logger.info("YARN queue set to: {} ", new Object[] { yarnQueue });
        }

        if (additionalSparkConfigOptionsArray != null && additionalSparkConfigOptionsArray.length > 0) {
            for (String additionalSparkConfigOption : additionalSparkConfigOptionsArray) {
                String[] confKeyValue = additionalSparkConfigOption.split("=");
                if (confKeyValue.length == 2) {
                    pySparkLauncher = pySparkLauncher.setConf(confKeyValue[0], confKeyValue[1]);
                    logger.info("Spark additional config option set to: {}={}",
                            new Object[] { confKeyValue[0], confKeyValue[1] });
                }
            }
        }

        logger.info("Starting execution of PySpark job");
        Process pySparkProcess = pySparkLauncher.launch();

        InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO,
                logger, pySparkProcess.getInputStream());
        Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
        inputThread.start();

        InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO,
                logger, pySparkProcess.getErrorStream());
        Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
        errorThread.start();

        logger.info("Waiting for PySpark job to complete");

        int exitCode = pySparkProcess.waitFor();
        if (exitCode != 0) {
            logger.info("Finished execution of PySpark job [FAILURE] [Status code: {}]",
                    new Object[] { exitCode });
            session.transfer(flowFile, REL_FAILURE);
        } else {
            logger.info("Finished execution of PySpark job [SUCCESS] [Status code: {}]",
                    new Object[] { exitCode });
            session.transfer(flowFile, REL_SUCCESS);
        }
    } catch (final Exception e) {
        logger.error("Unable to execute PySpark job [FAILURE]", new Object[] { flowFile, e });
        session.transfer(flowFile, REL_FAILURE);
    }
}

From source file: com.uber.hoodie.cli.commands.CommitsCommand.java

License: Apache License

@CliCommand(value = "commit rollback", help = "Rollback a commit")
public String rollbackCommit(
        @CliOption(key = { "commit" }, help = "Commit to rollback") final String commitTime,
        @CliOption(key = {
                "sparkProperties" }, help = "Spark Properties File Path") final String sparkPropertiesPath)
        throws Exception {
    HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);

    if (!timeline.containsInstant(commitInstant)) {
        return "Commit " + commitTime + " not found in Commits " + timeline;
    }

    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime,
            HoodieCLI.tableMetadata.getBasePath());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    // Refresh the current view of commits
    refreshCommits();
    if (exitCode != 0) {
        return "Commit " + commitTime + " failed to roll back";
    }
    return "Commit " + commitTime + " rolled back";
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction schedule", help = "Schedule Compaction")
public String scheduleCompact(
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory") final String sparkMemory)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    // First get a compaction instant time and pass it to spark launcher for scheduling compaction
    String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime();

    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
        SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
        sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(),
                HoodieCLI.tableMetadata.getBasePath(), HoodieCLI.tableMetadata.getTableConfig().getTableName(),
                compactionInstantTime, sparkMemory);
        Process process = sparkLauncher.launch();
        InputStreamConsumer.captureOutput(process);
        int exitCode = process.waitFor();
        if (exitCode != 0) {
            return "Failed to run compaction for " + compactionInstantTime;
        }
        return "Compaction successfully completed for " + compactionInstantTime;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")
public String compact(@CliOption(key = {
        "parallelism" }, mandatory = true, help = "Parallelism for hoodie compaction") final String parallelism,
        @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String schemaFilePath,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory,
        @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry,
        @CliOption(key = "compactionInstant", mandatory = true, help = "Base path for the target hoodie dataset") final String compactionInstantTime)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
        SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
        sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(),
                HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism,
                schemaFilePath, sparkMemory, retry);
        Process process = sparkLauncher.launch();
        InputStreamConsumer.captureOutput(process);
        int exitCode = process.waitFor();
        if (exitCode != 0) {
            return "Failed to run compaction for " + compactionInstantTime;
        }
        return "Compaction successfully completed for " + compactionInstantTime;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction validate", help = "Validate Compaction")
public String validateCompaction(
        @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
        @CliOption(key = {
                "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = null;
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism,
                    master, sparkMemory);
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to validate compaction for " + compactionInstant;
            }
            List<ValidationOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            boolean valid = res.stream().map(r -> r.isSuccess()).reduce(Boolean::logicalAnd).orElse(true);
            String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n";
            List<Comparable[]> rows = new ArrayList<>();
            res.stream().forEach(r -> {
                Comparable[] row = new Comparable[] { r.getOperation().getFileId(),
                        r.getOperation().getBaseInstantTime(),
                        r.getOperation().getDataFilePath().isPresent()
                                ? r.getOperation().getDataFilePath().get()
                                : "",
                        r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
                        r.getException().isPresent() ? r.getException().get().getMessage() : "" };
                rows.add(row);
            });

            Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
            TableHeader header = new TableHeader().addTableHeaderField("File Id")
                    .addTableHeaderField("Base Instant Time").addTableHeaderField("Base Data File")
                    .addTableHeaderField("Num Delta Files").addTableHeaderField("Valid")
                    .addTableHeaderField("Error");

            output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending,
                    limit, headerOnly, rows);
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction")
public String unscheduleCompaction(
        @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
        @CliOption(key = {
                "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = {
                "skipValidation" }, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
        @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = "";
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism,
                    master, sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to unschedule compaction for " + compactionInstant;
            }
            List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
                    "unschedule pending compaction");
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId")
public String unscheduleCompactFile(
        @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = {
                "skipValidation" }, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV,
        @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);

    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = "";
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master, sparkMemory,
                    Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString());
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to unschedule compaction for file " + fileId;
            }
            List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
                    "unschedule file from pending compaction");
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}

From source file: com.uber.hoodie.cli.commands.CompactionCommand.java

License: Apache License

@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as "
        + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.")
public String repairCompaction(
        @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant,
        @CliOption(key = {
                "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism,
        @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master,
        @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory,
        @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending,
        @CliOption(key = {
                "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly)
        throws Exception {
    boolean initialized = HoodieCLI.initConf();
    HoodieCLI.initFS(initialized);
    String outputPathStr = getTmpSerializerFile();
    Path outputPath = new Path(outputPathStr);
    String output = "";
    if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) {
        try {
            String sparkPropertiesPath = Utils.getDefaultPropertiesFile(
                    scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties()));
            SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
            sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(),
                    HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism,
                    master, sparkMemory, Boolean.valueOf(dryRun).toString());
            Process process = sparkLauncher.launch();
            InputStreamConsumer.captureOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                return "Failed to unschedule compaction for " + compactionInstant;
            }
            List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs);
            output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly,
                    "repair compaction");
        } finally {
            // Delete tmp file used to serialize result
            if (HoodieCLI.fs.exists(outputPath)) {
                HoodieCLI.fs.delete(outputPath, false);
            }
        }
        return output;
    } else {
        throw new Exception("Compactions can only be run for table type : MERGE_ON_READ");
    }
}