Java tutorial: the Pentaho Big Data AmazonElasticMapReduceJobExecutor job entry. The class below stages a user-supplied JAR in an S3 bucket, runs it as a step on an Amazon Elastic MapReduce (EMR) job flow (either a newly created cluster or an existing one identified by a job flow id), and can optionally block while polling the flow's execution state.
/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.amazon.emr.job;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;
import org.apache.commons.vfs.FileObject;
import org.apache.commons.vfs.FileSystemOptions;
import org.apache.commons.vfs.auth.StaticUserAuthenticator;
import org.apache.commons.vfs.impl.DefaultFileSystemConfigBuilder;
import org.pentaho.amazon.AbstractAmazonJobEntry;
import org.pentaho.di.cluster.SlaveServer;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.Result;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.annotations.JobEntry;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.encryption.Encr;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleXMLException;
import org.pentaho.di.core.hadoop.HadoopConfigurationBootstrap;
import org.pentaho.di.core.logging.Log4jFileAppender;
import org.pentaho.di.core.logging.LogWriter;
import org.pentaho.di.core.util.StringUtil;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.core.xml.XMLHandler;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.job.entries.hadoopjobexecutor.JarUtility;
import org.pentaho.di.job.entry.JobEntryInterface;
import org.pentaho.di.repository.ObjectId;
import org.pentaho.di.repository.Repository;
import org.pentaho.hadoop.shim.spi.HadoopShim;
import org.w3c.dom.Node;

import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient;
import com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest;
import com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsRequest;
import com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsResult;
import com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig;
import com.amazonaws.services.elasticmapreduce.model.JobFlowDetail;
import com.amazonaws.services.elasticmapreduce.model.JobFlowInstancesConfig;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowRequest;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowResult;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.S3Object;

@JobEntry(id = "EMRJobExecutorPlugin", image = "EMR.svg", name = "EMRJobExecutorPlugin.Name",
    description = "EMRJobExecutorPlugin.Description", categoryDescription =
"i18n:org.pentaho.di.job:JobCategory.Category.BigData", i18nPackageName = "org.pentaho.amazon.emr.job") public class AmazonElasticMapReduceJobExecutor extends AbstractAmazonJobEntry implements Cloneable, JobEntryInterface { private static Class<?> PKG = AmazonElasticMapReduceJobExecutor.class; // for i18n purposes, needed by Translator2!! // $NON-NLS-1$ private JarUtility util = new JarUtility(); public AmazonElasticMapReduceJobExecutor() { } public String getMainClass(URL localJarUrl) throws Exception { HadoopShim shim = HadoopConfigurationBootstrap.getHadoopConfigurationProvider().getActiveConfiguration() .getHadoopShim(); final Class<?> mainClass = util.getMainClassFromManifest(localJarUrl, shim.getClass().getClassLoader()); if (mainClass != null) { return mainClass.getName(); } else { List<Class<?>> classesWithMains = util.getClassesInJarWithMain(localJarUrl.toExternalForm(), shim.getClass().getClassLoader()); if (!classesWithMains.isEmpty()) { return classesWithMains.get(0).getName(); } } throw new RuntimeException("Could not find main class in: " + localJarUrl.toExternalForm()); } public Result execute(Result result, int arg1) throws KettleException { Log4jFileAppender appender = null; String logFileName = "pdi-" + this.getName(); //$NON-NLS-1$ try { appender = LogWriter.createFileAppender(logFileName, true, false); LogWriter.getInstance().addAppender(appender); log.setLogLevel(parentJob.getLogLevel()); } catch (Exception e) { logError(BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.FailedToOpenLogFile", //$NON-NLS-1$ logFileName, e.toString())); logError(Const.getStackTracker(e)); } try { // create/connect aws service AmazonElasticMapReduceClient emrClient = new AmazonElasticMapReduceClient(awsCredentials); // pull down jar from vfs FileObject jarFile = KettleVFS.getFileObject(buildFilename(jarUrl)); File tmpFile = File.createTempFile("customEMR", "jar"); tmpFile.deleteOnExit(); FileOutputStream tmpFileOut = new FileOutputStream(tmpFile); IOUtils.copy(jarFile.getContent().getInputStream(), tmpFileOut); URL localJarUrl = tmpFile.toURI().toURL(); // find main class in jar String mainClass = getMainClass(localJarUrl); // create staging bucket AmazonS3 s3Client = new AmazonS3Client(awsCredentials); FileSystemOptions opts = new FileSystemOptions(); DefaultFileSystemConfigBuilder.getInstance().setUserAuthenticator(opts, new StaticUserAuthenticator( null, awsCredentials.getAWSAccessKeyId(), awsCredentials.getAWSSecretKey())); FileObject stagingDirFileObject = KettleVFS.getFileObject(stagingDir, getVariables(), opts); String stagingBucketName = stagingDirFileObject.getName().getBaseName(); if (!s3Client.doesBucketExist(stagingBucketName)) { s3Client.createBucket(stagingBucketName); } // delete old jar if needed try { s3Client.deleteObject(stagingBucketName, jarFile.getName().getBaseName()); } catch (Exception ex) { logError(Const.getStackTracker(ex)); } // put jar in s3 staging bucket s3Client.putObject(new PutObjectRequest(stagingBucketName, jarFile.getName().getBaseName(), tmpFile)); // create non-vfs s3 url to jar String stagingS3JarUrl = "s3://" + stagingBucketName + "/" + jarFile.getName().getBaseName(); String stagingS3BucketUrl = "s3://" + stagingBucketName; RunJobFlowRequest runJobFlowRequest = null; RunJobFlowResult runJobFlowResult = null; if (StringUtil.isEmpty(hadoopJobFlowId)) { // create EMR job flow runJobFlowRequest = createJobFlow(stagingS3BucketUrl, stagingS3JarUrl, mainClass); // start EMR job runJobFlowResult = emrClient.runJobFlow(runJobFlowRequest); } 
      else {
        // an existing job flow id was supplied: add the jar as a new step to that job flow
        List<String> jarStepArgs = new ArrayList<String>();
        if (!StringUtil.isEmpty(cmdLineArgs)) {
          StringTokenizer st = new StringTokenizer(cmdLineArgs, " ");
          while (st.hasMoreTokens()) {
            String token = st.nextToken();
            logBasic("adding args: " + token);
            jarStepArgs.add(token);
          }
        }

        HadoopJarStepConfig hadoopJarStep = new HadoopJarStepConfig();
        hadoopJarStep.setJar(stagingS3JarUrl);
        hadoopJarStep.setMainClass(mainClass);
        hadoopJarStep.setArgs(jarStepArgs);

        StepConfig stepConfig = new StepConfig();
        stepConfig.setName("custom jar: " + jarUrl);
        stepConfig.setHadoopJarStep(hadoopJarStep);

        List<StepConfig> steps = new ArrayList<StepConfig>();
        steps.add(stepConfig);

        AddJobFlowStepsRequest addJobFlowStepsRequest = new AddJobFlowStepsRequest();
        addJobFlowStepsRequest.setJobFlowId(hadoopJobFlowId);
        addJobFlowStepsRequest.setSteps(steps);

        emrClient.addJobFlowSteps(addJobFlowStepsRequest);
      }

      String loggingIntervalS = environmentSubstitute(loggingInterval);
      int logIntv = 60;
      try {
        logIntv = Integer.parseInt(loggingIntervalS);
      } catch (NumberFormatException ex) {
        logError("Unable to parse logging interval '" + loggingIntervalS + "' - using default of 60");
      }

      // monitor it / blocking / logging if desired
      if (blocking) {
        try {
          if (log.isBasic()) {

            String executionState = "RUNNING";

            List<String> jobFlowIds = new ArrayList<String>();
            String id = hadoopJobFlowId;
            if (StringUtil.isEmpty(hadoopJobFlowId)) {
              id = runJobFlowResult.getJobFlowId();
              jobFlowIds.add(id);
            }

            while (isRunning(executionState)) {
              DescribeJobFlowsRequest describeJobFlowsRequest = new DescribeJobFlowsRequest();
              describeJobFlowsRequest.setJobFlowIds(jobFlowIds);

              DescribeJobFlowsResult describeJobFlowsResult = emrClient.describeJobFlows(describeJobFlowsRequest);

              boolean found = false;
              for (JobFlowDetail jobFlowDetail : describeJobFlowsResult.getJobFlows()) {
                if (jobFlowDetail.getJobFlowId().equals(id)) {
                  executionState = jobFlowDetail.getExecutionStatusDetail().getState();
                  found = true;
                }
              }

              if (!found) {
                break;
              }

              // logBasic(BaseMessages.getString(PKG, "AmazonElasticMapReduceJobExecutor.RunningPercent", setupPercent,
              // mapPercent, reducePercent));
              logBasic(hadoopJobName + " execution status: " + executionState);

              try {
                if (isRunning(executionState)) {
                  Thread.sleep(logIntv * 1000);
                }
              } catch (InterruptedException ie) {
                // Ignore
              }
            }

            if ("FAILED".equalsIgnoreCase(executionState)) {
              result.setStopped(true);
              result.setNrErrors(1);
              result.setResult(false);

              S3Object outObject = s3Client.getObject(stagingBucketName, id + "/steps/1/stdout");
              ByteArrayOutputStream outStream = new ByteArrayOutputStream();
              IOUtils.copy(outObject.getObjectContent(), outStream);
              logError(outStream.toString());

              S3Object errorObject = s3Client.getObject(stagingBucketName, id + "/steps/1/stderr");
              ByteArrayOutputStream errorStream = new ByteArrayOutputStream();
              IOUtils.copy(errorObject.getObjectContent(), errorStream);
              logError(errorStream.toString());
            }
          }
        } catch (Exception e) {
          logError(e.getMessage(), e);
        }
      }

    } catch (Throwable t) {
      t.printStackTrace();
      result.setStopped(true);
      result.setNrErrors(1);
      result.setResult(false);
      logError(t.getMessage(), t);
    }

    if (appender != null) {
      LogWriter.getInstance().removeAppender(appender);
      appender.close();

      ResultFile resultFile =
          new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(), parentJob.getJobname(), getName());
      result.getResultFiles().put(resultFile.getFile().toString(), resultFile);
    }

    return result;
  }

  // Build a RunJobFlowRequest for a new EMR cluster: a single custom-jar step plus the instance configuration.
  public RunJobFlowRequest createJobFlow(String stagingS3BucketUrl, String stagingS3Jar, String mainClass) {
    List<String> jarStepArgs = new ArrayList<String>();
    if (!StringUtil.isEmpty(cmdLineArgs)) {
      StringTokenizer st = new StringTokenizer(cmdLineArgs, " ");
      while (st.hasMoreTokens()) {
        String token = st.nextToken();
        logBasic("adding args: " + token);
        jarStepArgs.add(token);
      }
    }

    HadoopJarStepConfig hadoopJarStep = new HadoopJarStepConfig();
    hadoopJarStep.setJar(stagingS3Jar);
    hadoopJarStep.setMainClass(mainClass);
    hadoopJarStep.setArgs(jarStepArgs);

    StepConfig stepConfig = new StepConfig();
    stepConfig.setName("custom jar: " + jarUrl);
    stepConfig.setHadoopJarStep(hadoopJarStep);

    List<StepConfig> steps = new ArrayList<StepConfig>();
    steps.add(stepConfig);

    String numInstancesS = environmentSubstitute(numInstances);
    int numInsts = 2;
    try {
      numInsts = Integer.parseInt(numInstancesS);
    } catch (NumberFormatException e) {
      logError("Unable to parse number of instances to use '" + numInstancesS + "' - using 2 instances...");
    }

    JobFlowInstancesConfig instances = new JobFlowInstancesConfig();
    instances.setInstanceCount(numInsts);
    instances.setMasterInstanceType(getInstanceType(masterInstanceType));
    instances.setSlaveInstanceType(getInstanceType(slaveInstanceType));
    instances.setHadoopVersion("0.20");

    RunJobFlowRequest runJobFlowRequest = new RunJobFlowRequest();
    runJobFlowRequest.setSteps(steps);
    runJobFlowRequest.setLogUri(stagingS3BucketUrl);
    runJobFlowRequest.setName(hadoopJobName);
    runJobFlowRequest.setInstances(instances);

    // ScriptBootstrapActionConfig scriptBootstrapAction = new ScriptBootstrapActionConfig();
    // scriptBootstrapAction.setPath("s3://mddwordcount/bootstrap.sh");
    // List<String> bootstrapArgs = new ArrayList<String>();
    // bootstrapArgs.add("http://pdi-node-dist.s3.amazonaws.com");
    // bootstrapArgs.add(
    //   "http://ci.pentaho.com/view/Data%20Integration/job/Kettle/lastSuccessfulBuild/artifact/Kettle/");
    // bootstrapArgs.add("pdi-hadoop-node-TRUNK-SNAPSHOT.zip");
    // scriptBootstrapAction.setArgs(bootstrapArgs);
    // BootstrapActionConfig bootstrapActionConfig = new BootstrapActionConfig();
    // bootstrapActionConfig.setName("mdd bootstrap");
    // bootstrapActionConfig.setScriptBootstrapAction(scriptBootstrapAction);
    // List<BootstrapActionConfig> bootstrapActions = new ArrayList<BootstrapActionConfig>();
    // bootstrapActions.add(bootstrapActionConfig);
    // runJobFlowRequest.setBootstrapActions(bootstrapActions);

    return runJobFlowRequest;
  }

  // Instance types are stored as "label [api-name]"; return the text between the last '[' and the last ']'.
  public static String getInstanceType(String unparsedInstanceType) {
    return unparsedInstanceType.substring(unparsedInstanceType.lastIndexOf("[") + 1,
        unparsedInstanceType.lastIndexOf("]"));
  }

  public static boolean isRunning(String state) {
    // * <b>Pattern: </b>COMPLETED|FAILED|TERMINATED|RUNNING|SHUTTING_DOWN|STARTING|WAITING|BOOTSTRAPPING<br/>
    if ("COMPLETED".equalsIgnoreCase(state)) {
      return false;
    }
    if ("FAILED".equalsIgnoreCase(state)) {
      return false;
    }
    if ("TERMINATED".equalsIgnoreCase(state)) {
      return false;
    }
    return true;
  }

  public void loadXML(Node entrynode, List<DatabaseMeta> databases, List<SlaveServer> slaveServers, Repository rep)
    throws KettleXMLException {
    super.loadXML(entrynode, databases, slaveServers);
    hadoopJobName = XMLHandler.getTagValue(entrynode, "hadoop_job_name");
    hadoopJobFlowId = XMLHandler.getTagValue(entrynode, "hadoop_job_flow_id");
    jarUrl = XMLHandler.getTagValue(entrynode, "jar_url");
    accessKey = Encr.decryptPasswordOptionallyEncrypted(XMLHandler.getTagValue(entrynode, "access_key"));
    secretKey = Encr.decryptPasswordOptionallyEncrypted(XMLHandler.getTagValue(entrynode, "secret_key"));
    stagingDir = XMLHandler.getTagValue(entrynode, "staging_dir");
    // numInstances = Integer.parseInt(XMLHandler.getTagValue(entrynode, "num_instances"));
    numInstances = XMLHandler.getTagValue(entrynode, "num_instances");
    masterInstanceType = XMLHandler.getTagValue(entrynode, "master_instance_type");
    slaveInstanceType = XMLHandler.getTagValue(entrynode, "slave_instance_type");
    cmdLineArgs = XMLHandler.getTagValue(entrynode, "command_line_args");
    blocking = "Y".equalsIgnoreCase(XMLHandler.getTagValue(entrynode, "blocking"));
    /*
     * try { loggingInterval = Integer.parseInt(XMLHandler.getTagValue(entrynode, "logging_interval")); } catch
     * (NumberFormatException nfe) { }
     */
    loggingInterval = XMLHandler.getTagValue(entrynode, "logging_interval");
  }

  public String getXML() {
    StringBuffer retval = new StringBuffer(1024);
    retval.append(super.getXML());

    retval.append(" ").append(XMLHandler.addTagValue("hadoop_job_name", hadoopJobName));
    retval.append(" ").append(XMLHandler.addTagValue("hadoop_job_flow_id", hadoopJobFlowId));
    retval.append(" ").append(XMLHandler.addTagValue("jar_url", jarUrl));
    retval.append(" ")
        .append(XMLHandler.addTagValue("access_key", Encr.encryptPasswordIfNotUsingVariables(accessKey)));
    retval.append(" ")
        .append(XMLHandler.addTagValue("secret_key", Encr.encryptPasswordIfNotUsingVariables(secretKey)));
    retval.append(" ").append(XMLHandler.addTagValue("staging_dir", stagingDir));
    retval.append(" ").append(XMLHandler.addTagValue("num_instances", numInstances));
    retval.append(" ").append(XMLHandler.addTagValue("master_instance_type", masterInstanceType));
    retval.append(" ").append(XMLHandler.addTagValue("slave_instance_type", slaveInstanceType));
    retval.append(" ").append(XMLHandler.addTagValue("command_line_args", cmdLineArgs));
    retval.append(" ").append(XMLHandler.addTagValue("blocking", blocking));
    retval.append(" ").append(XMLHandler.addTagValue("logging_interval", loggingInterval));
    retval.append(" ").append(XMLHandler.addTagValue("hadoop_job_name", hadoopJobName));

    return retval.toString();
  }

  public void loadRep(Repository rep, ObjectId id_jobentry, List<DatabaseMeta> databases,
      List<SlaveServer> slaveServers) throws KettleException {
    if (rep != null) {
      super.loadRep(rep, id_jobentry, databases, slaveServers);

      setHadoopJobName(rep.getJobEntryAttributeString(id_jobentry, "hadoop_job_name"));
      setHadoopJobFlowId(rep.getJobEntryAttributeString(id_jobentry, "hadoop_job_flow_id"));
      setJarUrl(rep.getJobEntryAttributeString(id_jobentry, "jar_url"));
      setAccessKey(Encr.decryptPasswordOptionallyEncrypted(rep.getJobEntryAttributeString(id_jobentry, "access_key")));
      setSecretKey(Encr.decryptPasswordOptionallyEncrypted(rep.getJobEntryAttributeString(id_jobentry, "secret_key")));
      setStagingDir(rep.getJobEntryAttributeString(id_jobentry, "staging_dir"));
      // setNumInstances(new Long(rep.getJobEntryAttributeInteger(id_jobentry, "num_instances")).intValue());
      setNumInstances(rep.getJobEntryAttributeString(id_jobentry, "num_instances"));
      setMasterInstanceType(rep.getJobEntryAttributeString(id_jobentry, "master_instance_type"));
      setSlaveInstanceType(rep.getJobEntryAttributeString(id_jobentry, "slave_instance_type"));
      setCmdLineArgs(rep.getJobEntryAttributeString(id_jobentry, "command_line_args"));
      setBlocking(rep.getJobEntryAttributeBoolean(id_jobentry, "blocking"));
      // setLoggingInterval(new Long(rep.getJobEntryAttributeInteger(id_jobentry, "logging_interval")).intValue());
      setLoggingInterval(rep.getJobEntryAttributeString(id_jobentry, "logging_interval"));
    } else {
      throw new KettleException("Unable to load from a repository. The repository is null."); //$NON-NLS-1$
    }
  }

  public void saveRep(Repository rep, ObjectId id_job) throws KettleException {
    if (rep != null) {
      super.saveRep(rep, id_job);

      rep.saveJobEntryAttribute(id_job, getObjectId(), "hadoop_job_name", hadoopJobName); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "hadoop_job_flow_id", hadoopJobFlowId); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "jar_url", jarUrl); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "secret_key", //$NON-NLS-1$
          Encr.encryptPasswordIfNotUsingVariables(secretKey));
      rep.saveJobEntryAttribute(id_job, getObjectId(), "access_key", //$NON-NLS-1$
          Encr.encryptPasswordIfNotUsingVariables(accessKey));
      rep.saveJobEntryAttribute(id_job, getObjectId(), "staging_dir", stagingDir); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "num_instances", numInstances); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "master_instance_type", masterInstanceType); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "slave_instance_type", slaveInstanceType); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "command_line_args", cmdLineArgs); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "blocking", blocking); //$NON-NLS-1$
      rep.saveJobEntryAttribute(id_job, getObjectId(), "logging_interval", loggingInterval); //$NON-NLS-1$
    } else {
      throw new KettleException("Unable to save to a repository. The repository is null."); //$NON-NLS-1$
    }
  }

  public String buildFilename(String filename) {
    filename = environmentSubstitute(filename);
    return filename;
  }

  public boolean evaluates() {
    return true;
  }

  public boolean isUnconditional() {
    return true;
  }

  @Override
  public String getDialogClassName() {
    String className = getClass().getCanonicalName();
    className = className.replaceFirst("\\.job\\.", ".ui.");
    className += "Dialog";
    return className;
  }
}
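The two static helpers at the end of the class can be exercised on their own. Below is a minimal sketch, assuming the class above and its dependencies are on the classpath; the demo class name EmrHelperDemo and the "Small [m1.small]" label are illustrative values, since getInstanceType only looks at the text between the last '[' and the last ']'.

import org.pentaho.amazon.emr.job.AmazonElasticMapReduceJobExecutor;

public class EmrHelperDemo {
  public static void main(String[] args) {
    // getInstanceType() strips a "label [api-name]" string down to the EC2 API name.
    System.out.println(AmazonElasticMapReduceJobExecutor.getInstanceType("Small [m1.small]")); // m1.small

    // isRunning() returns false only for the terminal states COMPLETED, FAILED and TERMINATED;
    // any other state (RUNNING, WAITING, BOOTSTRAPPING, ...) keeps the polling loop in execute() going.
    System.out.println(AmazonElasticMapReduceJobExecutor.isRunning("RUNNING"));   // true
    System.out.println(AmazonElasticMapReduceJobExecutor.isRunning("COMPLETED")); // false
  }
}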