Usage examples for org.apache.spark.launcher.SparkLauncher#launch()
public Process launch() throws IOException
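Before the project examples below, here is a minimal, self-contained sketch of the usual launch() pattern. The Spark home, application jar, main class and master URL are placeholders (assumptions, not taken from any example on this page); the stream-draining and waitFor() steps mirror what most of the examples repeat.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.spark.launcher.SparkLauncher;

public class LaunchExample {
  public static void main(String[] args) throws IOException, InterruptedException {
    // Placeholder values; replace with your own Spark home, jar, class and master.
    Process spark = new SparkLauncher()
        .setSparkHome("/opt/spark")             // assumption: local Spark installation
        .setAppResource("/path/to/my-app.jar")  // assumption: application jar
        .setMainClass("com.example.MyApp")      // assumption: application main class
        .setMaster("local[2]")
        .setConf(SparkLauncher.DRIVER_MEMORY, "1g")
        .addAppArgs("arg1")
        .launch();

    // Drain stdout and stderr so the child process cannot block on full pipes.
    drain(spark.getInputStream());
    drain(spark.getErrorStream());

    int exitCode = spark.waitFor();
    System.out.println("spark-submit exited with code " + exitCode);
  }

  private static void drain(final InputStream in) {
    Thread t = new Thread(() -> {
      try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
        String line;
        while ((line = reader.readLine()) != null) {
          System.out.println(line);
        }
      } catch (IOException ignored) {
        // Stream closes when the child process exits.
      }
    });
    t.setDaemon(true);
    t.start();
  }
}

SparkLauncher also offers startApplication(), which returns a SparkAppHandle for monitoring application state without managing the raw Process yourself.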
From source file:com.cloudera.livy.client.local.ContextLauncher.java
License:Apache License
private static ChildProcess startDriver(final RpcServer rpcServer, final LocalConf conf, final String clientId,
    final String secret, final String className) throws IOException {
  final String serverAddress = rpcServer.getAddress();
  final String serverPort = String.valueOf(rpcServer.getPort());
  if (conf.get(CLIENT_IN_PROCESS) != null) {
    // Mostly for testing things quickly. Do not do this in production.
    LOG.warn("!!!! Running remote driver in-process. !!!!");
    Runnable child = new Runnable() {
      @Override
      public void run() {
        List<String> args = new ArrayList<>();
        args.add("--remote-host");
        args.add(serverAddress);
        args.add("--remote-port");
        args.add(serverPort);
        args.add("--client-id");
        args.add(clientId);
        args.add("--secret");
        args.add(secret);
        for (Map.Entry<String, String> e : conf) {
          args.add("--conf");
          args.add(String.format("%s=%s", e.getKey(), e.getValue()));
        }
        try {
          RemoteDriver.main(args.toArray(new String[args.size()]));
        } catch (Exception e) {
          LOG.error("Error running driver.", e);
        }
      }
    };
    return new ChildProcess(conf, child);
  } else {
    // If a Spark installation is provided, use the spark-submit script. Otherwise, call the
    // SparkSubmit class directly, which has some caveats (like having to provide a proper
    // version of Guava on the classpath depending on the deploy mode).
    final SparkLauncher launcher = new SparkLauncher();
    String sparkHome = conf.get(SPARK_HOME_KEY);
    if (sparkHome == null) {
      sparkHome = System.getenv(SPARK_HOME_ENV);
    }
    if (sparkHome == null) {
      sparkHome = System.getProperty(SPARK_HOME_KEY);
    }
    launcher.setSparkHome(sparkHome);

    conf.set(CLIENT_ID, clientId);
    conf.set(CLIENT_SECRET, secret);

    launcher.setAppResource("spark-internal");

    String livyJars = conf.get(LIVY_JARS);
    if (livyJars == null) {
      String livyHome = System.getenv("LIVY_HOME");
      Preconditions.checkState(livyHome != null, "Need one of LIVY_HOME or %s set.", LIVY_JARS.key());
      File clientJars = new File(livyHome, "client-jars");
      Preconditions.checkState(clientJars.isDirectory(),
          "Cannot find 'client-jars' directory under LIVY_HOME.");
      List<String> jars = new ArrayList<>();
      for (File f : clientJars.listFiles()) {
        jars.add(f.getAbsolutePath());
      }
      livyJars = Joiner.on(",").join(jars);
    }

    String userJars = conf.get(SPARK_JARS_KEY);
    if (userJars != null) {
      String allJars = Joiner.on(",").join(livyJars, userJars);
      conf.set(SPARK_JARS_KEY, allJars);
    } else {
      conf.set(SPARK_JARS_KEY, livyJars);
    }

    // Disable multiple attempts since the RPC server doesn't yet support multiple
    // connections for the same registered app.
    conf.set("spark.yarn.maxAppAttempts", "1");

    File confFile = writeConfToFile(conf);

    // Define how to pass options to the child process. If launching in client (or local)
    // mode, the driver options need to be passed directly on the command line. Otherwise,
    // SparkSubmit will take care of that for us.
    String master = conf.get("spark.master");
    Preconditions.checkArgument(master != null, "spark.master is not defined.");
    launcher.setMaster(master);
    launcher.setPropertiesFile(confFile.getAbsolutePath());
    launcher.setMainClass(className);
    if (conf.get(PROXY_USER) != null) {
      launcher.addSparkArg("--proxy-user", conf.get(PROXY_USER));
    }
    launcher.addAppArgs("--remote-host", serverAddress);
    launcher.addAppArgs("--remote-port", serverPort);
    return new ChildProcess(conf, launcher.launch());
  }
}
From source file:com.cloudera.livy.rsc.ContextLauncher.java
License:Apache License
private static ChildProcess startDriver(final RSCConf conf, Promise<?> promise) throws IOException {
  String livyJars = conf.get(LIVY_JARS);
  if (livyJars == null) {
    String livyHome = System.getenv("LIVY_HOME");
    Utils.checkState(livyHome != null, "Need one of LIVY_HOME or %s set.", LIVY_JARS.key());
    File rscJars = new File(livyHome, "rsc-jars");
    if (!rscJars.isDirectory()) {
      rscJars = new File(livyHome, "rsc/target/jars");
    }
    Utils.checkState(rscJars.isDirectory(), "Cannot find 'client-jars' directory under LIVY_HOME.");
    List<String> jars = new ArrayList<>();
    for (File f : rscJars.listFiles()) {
      jars.add(f.getAbsolutePath());
    }
    livyJars = Utils.join(jars, ",");
  }
  merge(conf, SPARK_JARS_KEY, livyJars, ",");

  String kind = conf.get(SESSION_KIND);
  if ("sparkr".equals(kind)) {
    merge(conf, SPARK_ARCHIVES_KEY, conf.get(RSCConf.Entry.SPARKR_PACKAGE), ",");
  } else if ("pyspark".equals(kind)) {
    merge(conf, "spark.submit.pyFiles", conf.get(RSCConf.Entry.PYSPARK_ARCHIVES), ",");
  }

  // Disable multiple attempts since the RPC server doesn't yet support multiple
  // connections for the same registered app.
  conf.set("spark.yarn.maxAppAttempts", "1");

  // Let the launcher go away when launcher in yarn cluster mode. This avoids keeping lots
  // of "small" Java processes lingering on the Livy server node.
  conf.set("spark.yarn.submit.waitAppCompletion", "false");

  // For testing; propagate jacoco settings so that we also do coverage analysis
  // on the launched driver. We replace the name of the main file ("main.exec")
  // so that we don't end up fighting with the main test launcher.
  String jacocoArgs = System.getProperty("jacoco.args");
  if (jacocoArgs != null) {
    jacocoArgs = jacocoArgs.replace("main.exec", "child.exec");
    merge(conf, SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, jacocoArgs, " ");
  }

  final File confFile = writeConfToFile(conf);

  if (conf.getBoolean(CLIENT_IN_PROCESS)) {
    // Mostly for testing things quickly. Do not do this in production.
    LOG.warn("!!!! Running remote driver in-process. !!!!");
    Runnable child = new Runnable() {
      @Override
      public void run() {
        try {
          RSCDriverBootstrapper.main(new String[] { confFile.getAbsolutePath() });
        } catch (Exception e) {
          throw Utils.propagate(e);
        }
      }
    };
    return new ChildProcess(conf, promise, child, confFile);
  } else {
    final SparkLauncher launcher = new SparkLauncher();
    launcher.setSparkHome(System.getenv(SPARK_HOME_ENV));
    launcher.setAppResource("spark-internal");
    launcher.setPropertiesFile(confFile.getAbsolutePath());
    launcher.setMainClass(RSCDriverBootstrapper.class.getName());

    if (conf.get(PROXY_USER) != null) {
      launcher.addSparkArg("--proxy-user", conf.get(PROXY_USER));
    }

    return new ChildProcess(conf, promise, launcher.launch(), confFile);
  }
}
From source file:com.thinkbiganalytics.nifi.pyspark.core.ExecutePySpark.java
License:Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
  final ComponentLog logger = getLog();
  FlowFile flowFile = session.get();

  if (flowFile == null) {
    flowFile = session.create();
    logger.info("Created a flow file having uuid: {}",
        new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
  } else {
    logger.info("Using an existing flow file having uuid: {}",
        new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
  }
  try {
    final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
    final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
    final String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
    final String pySparkAppFile = context.getProperty(PYSPARK_APP_FILE)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String pySparkAppArgs = context.getProperty(PYSPARK_APP_ARGS)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String pySparkAppName = context.getProperty(PYSPARK_APP_NAME)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String pySparkAdditionalFiles = context.getProperty(PYSPARK_ADDITIONAL_FILES)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile)
        .getValue().trim().toLowerCase();
    final String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile).getValue();
    final String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile).getValue();
    final String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile)
        .getValue();
    final String executorMemory = context.getProperty(EXECUTOR_MEMORY)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String executorInstances = context.getProperty(EXECUTOR_INSTANCES)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile)
        .getValue();
    final String networkTimeout = context.getProperty(NETWORK_TIMEOUT)
        .evaluateAttributeExpressions(flowFile).getValue();
    final String additionalSparkConfigOptions = context.getProperty(ADDITIONAL_SPARK_CONFIG_OPTIONS)
        .evaluateAttributeExpressions(flowFile).getValue();

    PySparkUtils pySparkUtils = new PySparkUtils();

    /* Get app arguments */
    String[] pySparkAppArgsArray = null;
    if (!StringUtils.isEmpty(pySparkAppArgs)) {
      pySparkAppArgsArray = pySparkUtils.getCsvValuesAsArray(pySparkAppArgs);
      logger.info("Provided application arguments: {}",
          new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
    }

    /* Get additional python files */
    String[] pySparkAdditionalFilesArray = null;
    if (!StringUtils.isEmpty(pySparkAdditionalFiles)) {
      pySparkAdditionalFilesArray = pySparkUtils.getCsvValuesAsArray(pySparkAdditionalFiles);
      logger.info("Provided python files: {}",
          new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAdditionalFilesArray) });
    }

    /* Get additional config key-value pairs */
    String[] additionalSparkConfigOptionsArray = null;
    if (!StringUtils.isEmpty(additionalSparkConfigOptions)) {
      additionalSparkConfigOptionsArray = pySparkUtils.getCsvValuesAsArray(additionalSparkConfigOptions);
      logger.info("Provided spark config options: {}",
          new Object[] { pySparkUtils.getCsvStringFromArray(additionalSparkConfigOptionsArray) });
    }

    /* Determine if Kerberos is enabled */
    boolean kerberosEnabled = false;
    if (!StringUtils.isEmpty(kerberosPrincipal) && !StringUtils.isEmpty(kerberosKeyTab)
        && !StringUtils.isEmpty(hadoopConfigurationResources)) {
      kerberosEnabled = true;
      logger.info("Kerberos is enabled");
    }

    /* For Kerberized cluster, attempt user authentication */
    if (kerberosEnabled) {
      logger.info("Attempting user authentication for Kerberos");
      ApplySecurityPolicy applySecurityObject = new ApplySecurityPolicy();
      Configuration configuration;
      try {
        logger.info("Getting Hadoop configuration from " + hadoopConfigurationResources);
        configuration = ApplySecurityPolicy.getConfigurationFromResources(hadoopConfigurationResources);

        if (SecurityUtil.isSecurityEnabled(configuration)) {
          logger.info("Security is enabled");

          if (kerberosPrincipal.equals("") && kerberosKeyTab.equals("")) {
            logger.error("Kerberos Principal and Keytab provided with empty values for a Kerberized cluster.");
            session.transfer(flowFile, REL_FAILURE);
            return;
          }

          try {
            logger.info("User authentication initiated");

            boolean authenticationStatus = applySecurityObject.validateUserWithKerberos(logger,
                hadoopConfigurationResources, kerberosPrincipal, kerberosKeyTab);
            if (authenticationStatus) {
              logger.info("User authenticated successfully.");
            } else {
              logger.error("User authentication failed.");
              session.transfer(flowFile, REL_FAILURE);
              return;
            }
          } catch (Exception unknownException) {
            logger.error("Unknown exception occurred while validating user :" + unknownException.getMessage());
            session.transfer(flowFile, REL_FAILURE);
            return;
          }
        }
      } catch (IOException e1) {
        logger.error("Unknown exception occurred while authenticating user :" + e1.getMessage());
        session.transfer(flowFile, REL_FAILURE);
        return;
      }
    }

    /* Build and launch PySpark Job */
    logger.info("Configuring PySpark job for execution");

    SparkLauncher pySparkLauncher = new SparkLauncher().setAppResource(pySparkAppFile);
    logger.info("PySpark app file set to: {}", new Object[] { pySparkAppFile });

    if (pySparkAppArgsArray != null && pySparkAppArgsArray.length > 0) {
      pySparkLauncher = pySparkLauncher.addAppArgs(pySparkAppArgsArray);
      logger.info("App arguments set to: {}",
          new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
    }

    pySparkLauncher = pySparkLauncher.setAppName(pySparkAppName).setMaster(sparkMaster);
    logger.info("App name set to: {}", new Object[] { pySparkAppName });
    logger.info("Spark master set to: {}", new Object[] { sparkMaster });

    if (pySparkAdditionalFilesArray != null && pySparkAdditionalFilesArray.length > 0) {
      for (String pySparkAdditionalFile : pySparkAdditionalFilesArray) {
        pySparkLauncher = pySparkLauncher.addPyFile(pySparkAdditionalFile);
        logger.info("Additional python file set to: {}", new Object[] { pySparkAdditionalFile });
      }
    }

    if (sparkMaster.equals("yarn")) {
      pySparkLauncher = pySparkLauncher.setDeployMode(sparkYarnDeployMode);
      logger.info("YARN deploy mode set to: {}", new Object[] { sparkYarnDeployMode });
    }

    pySparkLauncher = pySparkLauncher.setSparkHome(sparkHome)
        .setConf(SparkLauncher.DRIVER_MEMORY, driverMemory)
        .setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory)
        .setConf(CONFIG_PROP_SPARK_EXECUTOR_INSTANCES, executorInstances)
        .setConf(SparkLauncher.EXECUTOR_CORES, executorCores)
        .setConf(CONFIG_PROP_SPARK_NETWORK_TIMEOUT, networkTimeout);

    logger.info("Spark home set to: {} ", new Object[] { sparkHome });
    logger.info("Driver memory set to: {} ", new Object[] { driverMemory });
    logger.info("Executor memory set to: {} ", new Object[] { executorMemory });
    logger.info("Executor instances set to: {} ", new Object[] { executorInstances });
    logger.info("Executor cores set to: {} ", new Object[] { executorCores });
    logger.info("Network timeout set to: {} ", new Object[] { networkTimeout });

    if (kerberosEnabled) {
      pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_PRINCIPAL, kerberosPrincipal);
      pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_KEYTAB, kerberosKeyTab);
      logger.info("Kerberos principal set to: {} ", new Object[] { kerberosPrincipal });
      logger.info("Kerberos keytab set to: {} ", new Object[] { kerberosKeyTab });
    }

    if (!StringUtils.isEmpty(yarnQueue)) {
      pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_QUEUE, yarnQueue);
      logger.info("YARN queue set to: {} ", new Object[] { yarnQueue });
    }

    if (additionalSparkConfigOptionsArray != null && additionalSparkConfigOptionsArray.length > 0) {
      for (String additionalSparkConfigOption : additionalSparkConfigOptionsArray) {
        String[] confKeyValue = additionalSparkConfigOption.split("=");
        if (confKeyValue.length == 2) {
          pySparkLauncher = pySparkLauncher.setConf(confKeyValue[0], confKeyValue[1]);
          logger.info("Spark additional config option set to: {}={}",
              new Object[] { confKeyValue[0], confKeyValue[1] });
        }
      }
    }

    logger.info("Starting execution of PySpark job");
    Process pySparkProcess = pySparkLauncher.launch();

    InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger,
        pySparkProcess.getInputStream());
    Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
    inputThread.start();

    InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger,
        pySparkProcess.getErrorStream());
    Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
    errorThread.start();

    logger.info("Waiting for PySpark job to complete");

    int exitCode = pySparkProcess.waitFor();
    if (exitCode != 0) {
      logger.info("Finished execution of PySpark job [FAILURE] [Status code: {}]", new Object[] { exitCode });
      session.transfer(flowFile, REL_FAILURE);
    } else {
      logger.info("Finished execution of PySpark job [SUCCESS] [Status code: {}]", new Object[] { exitCode });
      session.transfer(flowFile, REL_SUCCESS);
    }
  } catch (final Exception e) {
    logger.error("Unable to execute PySpark job [FAILURE]", new Object[] { flowFile, e });
    session.transfer(flowFile, REL_FAILURE);
  }
}
From source file:com.uber.hoodie.cli.commands.CommitsCommand.java
License:Apache License
@CliCommand(value = "commit rollback", help = "Rollback a commit") public String rollbackCommit( @CliOption(key = { "commit" }, help = "Commit to rollback") final String commitTime, @CliOption(key = {/* ww w .ja va 2 s .com*/ "sparkProperties" }, help = "Spark Properites File Path") final String sparkPropertiesPath) throws Exception { HoodieActiveTimeline activeTimeline = HoodieCLI.tableMetadata.getActiveTimeline(); HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime); if (!timeline.containsInstant(commitInstant)) { return "Commit " + commitTime + " not found in Commits " + timeline; } SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), commitTime, HoodieCLI.tableMetadata.getBasePath()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); // Refresh the current refreshCommits(); if (exitCode != 0) { return "Commit " + commitTime + " failed to roll back"; } return "Commit " + commitTime + " rolled back"; }
From source file:com.uber.hoodie.cli.commands.CompactionCommand.java
License:Apache License
@CliCommand(value = "compaction schedule", help = "Schedule Compaction") public String scheduleCompact( @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", help = "Spark executor memory") final String sparkMemory) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized);/*w w w . j a va 2 s.co m*/ // First get a compaction instant time and pass it to spark launcher for scheduling compaction String compactionInstantTime = HoodieActiveTimeline.createNewCommitTime(); if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE.toString(), HoodieCLI.tableMetadata.getBasePath(), HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, sparkMemory); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to run compaction for " + compactionInstantTime; } return "Compaction successfully completed for " + compactionInstantTime; } else { throw new Exception("Compactions can only be run for table type : MERGE_ON_READ"); } }
From source file:com.uber.hoodie.cli.commands.CompactionCommand.java
License:Apache License
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time") public String compact(@CliOption(key = { "parallelism" }, mandatory = true, help = "Parallelism for hoodie compaction") final String parallelism, @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String schemaFilePath, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory, @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry, @CliOption(key = "compactionInstant", mandatory = true, help = "Base path for the target hoodie dataset") final String compactionInstantTime) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized);/*ww w . j a v a 2 s . c o m*/ if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_RUN.toString(), HoodieCLI.tableMetadata.getBasePath(), HoodieCLI.tableMetadata.getTableConfig().getTableName(), compactionInstantTime, parallelism, schemaFilePath, sparkMemory, retry); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to run compaction for " + compactionInstantTime; } return "Compaction successfully completed for " + compactionInstantTime; } else { throw new Exception("Compactions can only be run for table type : MERGE_ON_READ"); } }
From source file:com.uber.hoodie.cli.commands.CompactionCommand.java
License:Apache License
@CliCommand(value = "compaction validate", help = "Validate Compaction") public String validateCompaction( @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, @CliOption(key = {/*from w ww . ja v a 2 s. co m*/ "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output = null; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_VALIDATE.toString(), HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, sparkMemory); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to validate compaction for " + compactionInstant; } List<ValidationOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); boolean valid = res.stream().map(r -> r.isSuccess()).reduce(Boolean::logicalAnd).orElse(true); String message = "\n\n\t COMPACTION PLAN " + (valid ? "VALID" : "INVALID") + "\n\n"; List<Comparable[]> rows = new ArrayList<>(); res.stream().forEach(r -> { Comparable[] row = new Comparable[] { r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(), r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "", r.getOperation().getDeltaFilePaths().size(), r.isSuccess(), r.getException().isPresent() ? r.getException().get().getMessage() : "" }; rows.add(row); }); Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>(); TableHeader header = new TableHeader().addTableHeaderField("File Id") .addTableHeaderField("Base Instant Time").addTableHeaderField("Base Data File") .addTableHeaderField("Num Delta Files").addTableHeaderField("Valid") .addTableHeaderField("Error"); output = message + HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; } else { throw new Exception("Compactions can only be run for table type : MERGE_ON_READ"); } }
From source file:com.uber.hoodie.cli.commands.CompactionCommand.java
License:Apache License
@CliCommand(value = "compaction unschedule", help = "Unschedule Compaction") public String unscheduleCompaction( @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, @CliOption(key = {/*from w ww .jav a 2s .c o m*/ "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = { "skipValidation" }, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV, @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output = ""; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_PLAN.toString(), HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to unschedule compaction for " + compactionInstant; } List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule pending compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; } else { throw new Exception("Compactions can only be run for table type : MERGE_ON_READ"); } }
From source file:com.uber.hoodie.cli.commands.CompactionCommand.java
License:Apache License
@CliCommand(value = "compaction unscheduleFileId", help = "UnSchedule Compaction for a fileId") public String unscheduleCompactFile( @CliOption(key = "fileId", mandatory = true, help = "File Id") final String fileId, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = {/*from w w w . ja v a 2 s . c om*/ "skipValidation" }, help = "skip validation", unspecifiedDefaultValue = "false") boolean skipV, @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = { "headeronly" }, help = "Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output = ""; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_UNSCHEDULE_FILE.toString(), HoodieCLI.tableMetadata.getBasePath(), fileId, outputPathStr, "1", master, sparkMemory, Boolean.valueOf(skipV).toString(), Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to unschedule compaction for file " + fileId; } List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "unschedule file from pending compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; } else { throw new Exception("Compactions can only be run for table type : MERGE_ON_READ"); } }
From source file:com.uber.hoodie.cli.commands.CompactionCommand.java
License:Apache License
@CliCommand(value = "compaction repair", help = "Renames the files to make them consistent with the timeline as " + "dictated by Hoodie metadata. Use when compaction unschedule fails partially.") public String repairCompaction( @CliOption(key = "instant", mandatory = true, help = "Compaction Instant") String compactionInstant, @CliOption(key = {/*from w w w.j a v a 2 s .c o m*/ "parallelism" }, unspecifiedDefaultValue = "3", help = "Parallelism") String parallelism, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master ") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "2G", help = "executor memory") String sparkMemory, @CliOption(key = { "dryRun" }, help = "Dry Run Mode", unspecifiedDefaultValue = "false") boolean dryRun, @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") boolean headerOnly) throws Exception { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); String outputPathStr = getTmpSerializerFile(); Path outputPath = new Path(outputPathStr); String output = ""; if (HoodieCLI.tableMetadata.getTableType() == HoodieTableType.MERGE_ON_READ) { try { String sparkPropertiesPath = Utils.getDefaultPropertiesFile( scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); sparkLauncher.addAppArgs(SparkCommand.COMPACT_REPAIR.toString(), HoodieCLI.tableMetadata.getBasePath(), compactionInstant, outputPathStr, parallelism, master, sparkMemory, Boolean.valueOf(dryRun).toString()); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); if (exitCode != 0) { return "Failed to unschedule compaction for " + compactionInstant; } List<RenameOpResult> res = deSerializeOperationResult(outputPathStr, HoodieCLI.fs); output = getRenamesToBePrinted(res, limit, sortByField, descending, headerOnly, "repair compaction"); } finally { // Delete tmp file used to serialize result if (HoodieCLI.fs.exists(outputPath)) { HoodieCLI.fs.delete(outputPath, false); } } return output; } else { throw new Exception("Compactions can only be run for table type : MERGE_ON_READ"); } }