org.pentaho.amazon.AbstractAmazonJobExecutor.java Source code

Introduction

Here is the source code for org.pentaho.amazon.AbstractAmazonJobExecutor.java, an abstract Pentaho Data Integration job entry from the Pentaho Big Data project. It stages a job artifact in an Amazon S3 bucket and runs it on Amazon EMR, either by launching a new job flow or by adding a step to an existing one.

Source

/*! ******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.amazon;

import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileSystemOptions;
import org.apache.commons.vfs2.auth.StaticUserAuthenticator;
import org.apache.commons.vfs2.impl.DefaultFileSystemConfigBuilder;
import org.pentaho.amazon.client.ClientFactoriesManager;
import org.pentaho.amazon.client.ClientType;
import org.pentaho.amazon.client.api.EmrClient;
import org.pentaho.amazon.client.api.S3Client;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.Result;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.logging.Log4jFileAppender;
import org.pentaho.di.core.logging.LogWriter;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;

import java.io.File;
import java.io.IOException;

/**
 * Created by Aliaksandr_Zhuk on 1/31/2018.
 */
public abstract class AbstractAmazonJobExecutor extends AbstractAmazonJobEntry {

    private static Class<?> PKG = AbstractAmazonJobExecutor.class;

    private Log4jFileAppender appender = null;
    private S3Client s3Client;
    protected EmrClient emrClient;
    protected String key;
    protected int numInsts = 2;

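    /**
     * Creates a dedicated Log4j file appender named "pdi-" plus the job entry name, registers it
     * with the Kettle LogWriter and adopts the parent job's log level, so this job entry writes to
     * its own log file.
     */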
    public void setupLogFile() {
        String logFileName = "pdi-" + this.getName();
        try {
            appender = LogWriter.createFileAppender(logFileName, true, false);
            LogWriter.getInstance().addAppender(appender);
            log.setLogLevel(parentJob.getLogLevel());
        } catch (Exception e) {
            logError(BaseMessages.getString(PKG, "AbstractAmazonJobExecutor.FailedToOpenLogFile", logFileName,
                    e.toString()));
            logError(Const.getStackTracker(e));
        }
    }

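    /**
     * Returns the bucket name of the S3 staging directory: the first path segment of the resolved
     * staging URL.
     */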
    public String getStagingBucketName() throws FileSystemException, KettleException {

        String pathToStagingDir = getS3FileObjectPath();
        return pathToStagingDir.substring(1).split("/")[0];
    }

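    /**
     * Resolves the configured staging directory through Kettle VFS, authenticating with the AWS
     * access key id and secret key, and returns the path of the resulting file object.
     */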
    private String getS3FileObjectPath() throws FileSystemException, KettleFileException {
        FileSystemOptions opts = new FileSystemOptions();
        DefaultFileSystemConfigBuilder.getInstance().setUserAuthenticator(opts,
                new StaticUserAuthenticator(null, getAWSAccessKeyId(), getAWSSecretKey()));
        FileObject stagingDirFileObject = KettleVFS.getFileObject(stagingDir, getVariables(), opts);

        return stagingDirFileObject.getName().getPath();
    }

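    /**
     * Strips the leading slash and the bucket name from the staging path and returns the remaining
     * key prefix, or null when the staging directory points at the bucket root.
     */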
    private String getKeyFromS3StagingDir() throws KettleFileException, FileSystemException {

        String pathToStagingDir = getS3FileObjectPath();
        StringBuilder sb = new StringBuilder(pathToStagingDir);

        sb.replace(0, 1, "");
        if (sb.indexOf("/") == -1) {
            return null;
        }
        sb.replace(0, sb.indexOf("/") + 1, "");

        if (sb.length() > 0) {
            return sb.toString();
        } else {
            return null;
        }
    }

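    /**
     * Builds the S3 object key for the staged file: the key prefix taken from the staging
     * directory (if any) followed by the file's base name.
     */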
    protected void setS3BucketKey(FileObject stagingFile) throws KettleFileException, FileSystemException {

        String keyFromStagingDir = getKeyFromS3StagingDir();

        if (keyFromStagingDir == null) {
            keyFromStagingDir = "";
        }

        StringBuilder sb = new StringBuilder(keyFromStagingDir);
        if (sb.length() > 0) {
            sb.append("/");
        }
        sb.append(stagingFile.getName().getBaseName());
        key = sb.toString();
    }

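    /** Returns the staging bucket as an s3:// URL. */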
    public String getStagingS3BucketUrl(String stagingBucketName) {
        return "s3://" + stagingBucketName;
    }

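    /** Returns the staged file as an s3:// URL, built from the bucket name and the computed key. */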
    public String getStagingS3FileUrl(String stagingBucketName) {
        return "s3://" + stagingBucketName + "/" + key;
    }

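    /** Resolves Kettle environment variables in the given filename. */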
    public String buildFilename(String filename) {
        return environmentSubstitute(filename);
    }

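    /*
     * Hooks implemented by concrete job entries: the file to stage in S3, optional bootstrap
     * actions, the main class to run and the EMR step type.
     */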
    public abstract File createStagingFile() throws IOException, KettleException;

    public abstract String getStepBootstrapActions();

    public abstract String getMainClass() throws Exception;

    public abstract String getStepType();

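    /** Launches a new EMR job flow that runs the staged file as its step. */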
    private void runNewJobFlow(String stagingS3FileUrl, String stagingS3BucketUrl) throws Exception {
        emrClient.runJobFlow(stagingS3FileUrl, stagingS3BucketUrl, getStepType(), getMainClass(),
                getStepBootstrapActions(), this);
    }

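    /** Adds the staged file as a step to an already running EMR job flow. */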
    private void addStepToExistingJobFlow(String stagingS3FileUrl, String stagingS3BucketUrl) throws Exception {
        emrClient.addStepToExistingJobFlow(stagingS3FileUrl, stagingS3BucketUrl, getStepType(), getMainClass(),
                this);
    }

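    /** Reads the step's logs from the S3 log location and writes them to the Kettle log. */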
    private void logError(String stagingBucketName, String stepId) {
        logError(s3Client.readStepLogsFromS3(stagingBucketName, hadoopJobFlowId, stepId));
    }

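    /** Creates the S3 and EMR clients for the configured AWS credentials and region. */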
    private void initAmazonClients() {
        ClientFactoriesManager manager = ClientFactoriesManager.getInstance();
        s3Client = manager.createClient(getAWSAccessKeyId(), getAWSSecretKey(), region, ClientType.S3);
        emrClient = manager.createClient(getAWSAccessKeyId(), getAWSSecretKey(), region, ClientType.EMR);
    }

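    /**
     * Stages the job artifact in the S3 staging bucket, then either launches a new EMR job flow or
     * adds a step to an existing one. When blocking is enabled, polls the cluster and step state at
     * the configured logging interval until the step finishes, marking the Result as failed if the
     * step does not succeed or the job is stopped by the user.
     */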
    @Override
    public Result execute(Result result, int arg1) throws KettleException {

        setupLogFile();

        try {
            initAmazonClients();

            String stagingBucketName = getStagingBucketName();
            String stagingS3BucketUrl = getStagingS3BucketUrl(stagingBucketName);

            s3Client.createBucketIfNotExists(stagingBucketName);

            File tmpFile = createStagingFile();

            // delete old jar if needed
            try {
                s3Client.deleteObjectFromBucket(stagingBucketName, key);
            } catch (Exception ex) {
                logError(Const.getStackTracker(ex));
            }

            // put jar in s3 staging bucket
            s3Client.putObjectInBucket(stagingBucketName, key, tmpFile);
            String stagingS3FileUrl = getStagingS3FileUrl(stagingBucketName);

            if (runOnNewCluster) {
                // Determine the instances for Hadoop cluster.
                String numInstancesS = environmentSubstitute(numInstances);
                try {
                    numInsts = Integer.parseInt(numInstancesS);
                } catch (NumberFormatException e) {
                    logError(BaseMessages.getString(PKG, "AbstractAmazonJobExecutor.InstanceNumber.Error",
                            numInstancesS));
                }
                runNewJobFlow(stagingS3FileUrl, stagingS3BucketUrl);
                hadoopJobFlowId = emrClient.getHadoopJobFlowId();
            } else {
                addStepToExistingJobFlow(stagingS3FileUrl, stagingS3BucketUrl);
            }
            // Set a logging interval.
            String loggingIntervalS = environmentSubstitute(loggingInterval);
            int logIntv = 10;
            try {
                logIntv = Integer.parseInt(loggingIntervalS);
            } catch (NumberFormatException ex) {
                logError(BaseMessages.getString(PKG, "AbstractAmazonJobExecutor.LoggingInterval.Error",
                        loggingIntervalS));
            }
            // monitor and log if intended.
            if (blocking) {
                try {
                    if (log.isBasic()) {
                        while (emrClient.isRunning()) {

                            if (isJobStoppedByUser()) {
                                setResultError(result);
                                break;
                            }

                            if (emrClient.getCurrentClusterState() == null
                                    || emrClient.getCurrentClusterState().isEmpty()) {
                                break;
                            }
                            logBasic(hadoopJobName + " "
                                    + BaseMessages.getString(PKG,
                                            "AbstractAmazonJobExecutor.JobFlowExecutionStatus", hadoopJobFlowId)
                                    + emrClient.getCurrentClusterState() + " ");

                            logBasic(hadoopJobName
                                    + " " + BaseMessages.getString(PKG,
                                            "AbstractAmazonJobExecutor.JobFlowStepStatus", emrClient.getStepId())
                                    + emrClient.getCurrentStepState() + " ");

                            try {
                                Thread.sleep(logIntv * 1000);
                            } catch (InterruptedException ie) {
                                logError(Const.getStackTracker(ie));
                            }
                        }

                        if (emrClient.isClusterTerminated() && emrClient.isStepNotSuccess()) {
                            setResultError(result);
                            logError(hadoopJobName + " "
                                    + BaseMessages.getString(PKG,
                                            "AbstractAmazonJobExecutor.JobFlowExecutionStatus", hadoopJobFlowId)
                                    + emrClient.getCurrentClusterState());
                        }

                        if (emrClient.isStepNotSuccess()) {
                            setResultError(result);
                            logBasic(hadoopJobName
                                    + " " + BaseMessages.getString(PKG,
                                            "AbstractAmazonJobExecutor.JobFlowStepStatus", emrClient.getStepId())
                                    + emrClient.getCurrentStepState() + " ");

                            if (emrClient.isStepFailed()) {
                                logError(emrClient.getJobFlowLogUri(), emrClient.getStepId());
                            }
                        }
                    }
                } catch (Exception e) {
                    logError(e.getMessage(), e);
                }
            }
        } catch (Throwable t) {
            t.printStackTrace();
            setResultError(result);
            logError(t.getMessage(), t);
        }

        if (appender != null) {
            LogWriter.getInstance().removeAppender(appender);
            appender.close();

            ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(),
                    parentJob.getJobname(), getName());
            result.getResultFiles().put(resultFile.getFile().toString(), resultFile);
        }
        return result;
    }

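    /** When the parent job has been stopped or interrupted, asks the EMR client to cancel the pending steps. */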
    private boolean isJobStoppedByUser() {
        if (getParentJob().isInterrupted() || getParentJob().isStopped()) {
            return emrClient.stopSteps();
        }
        return false;
    }

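    /** Marks the result as stopped and failed, with one error. */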
    private void setResultError(Result result) {
        result.setStopped(true);
        result.setNrErrors(1);
        result.setResult(false);
    }
}
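
Example

The class above is abstract; concrete job entries supply the artifact to stage and the EMR step configuration through the four abstract hooks. The sketch below shows a minimal, hypothetical subclass. The class name, the jarPath field, the main class and the step type identifier are invented for illustration, and the sketch assumes that AbstractAmazonJobEntry (not shown here) declares no further abstract methods; a real PDI job entry would additionally need the usual plugin registration and a dialog.

package org.pentaho.amazon;

import java.io.File;
import java.io.IOException;

import org.pentaho.di.core.exception.KettleException;

/**
 * Hypothetical subclass sketch: stages a user-supplied jar in S3 and runs it as a custom EMR step.
 */
public class ExampleJarJobExecutor extends AbstractAmazonJobExecutor {

    // Hypothetical field holding the jar path configured in the job entry dialog.
    private String jarPath;

    public void setJarPath(String jarPath) {
        this.jarPath = jarPath;
    }

    @Override
    public File createStagingFile() throws IOException, KettleException {
        // Resolve Kettle variables in the configured path; the base class uploads this file
        // to the S3 staging bucket before starting or extending the job flow.
        return new File(buildFilename(jarPath));
    }

    @Override
    public String getStepBootstrapActions() {
        // No bootstrap actions in this sketch.
        return null;
    }

    @Override
    public String getMainClass() throws Exception {
        return "com.example.Main"; // invented main class, illustration only
    }

    @Override
    public String getStepType() {
        return "custom_jar"; // invented step type identifier
    }
}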