fr.ens.biologie.genomique.eoulsan.util.cloud.AWSElasticMapReduceJob.java Source code

Java tutorial

Introduction

Here is the source code for fr.ens.biologie.genomique.eoulsan.util.cloud.AWSElasticMapReduceJob.java

Source

/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.util.cloud;

import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static java.util.Objects.requireNonNull;

import java.util.Objects;

import com.amazonaws.AmazonClientException;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.ec2.model.InstanceType;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient;
import com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig;
import com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsRequest;
import com.amazonaws.services.elasticmapreduce.model.DescribeJobFlowsResult;
import com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig;
import com.amazonaws.services.elasticmapreduce.model.JobFlowDetail;
import com.amazonaws.services.elasticmapreduce.model.JobFlowExecutionStatusDetail;
import com.amazonaws.services.elasticmapreduce.model.JobFlowInstancesConfig;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowRequest;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowResult;
import com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;
import com.amazonaws.services.elasticmapreduce.util.StepFactory;

/**
 * This class define an AWS Elastic MapReduce job.
 * @since 1.0
 * @author Laurent Jourdren
 */
public class AWSElasticMapReduceJob {

    /** Maximum number of consecutive AmazonClientExceptions tolerated while polling. */
    private static final int MAX_FAIL_COUNT = 5;

    /** Version of hadoop to use with AWS Elastic MapReduce. */
    private String hadoopVersion = "0.20";

    /** Number of instances to use with AWS Elastic MapReduce. */
    private int nInstances = -1;

    /** Type of instance to use with AWS Elastic MapReduce master. */
    private String masterInstanceType;

    /** Type of instance to use with AWS Elastic MapReduce slaves. */
    private String slavesInstanceType = InstanceType.M1Xlarge.toString();

    /** End point to use with AWS Elastic MapReduce. */
    private String endpoint = "eu-west-1.elasticmapreduce.amazonaws.com";

    /** Log path to use with AWS Elastic MapReduce. */
    private String logPathname;

    /** Path to jar file. */
    private String jarLocation;

    /** Jar arguments. */
    private String[] jarArguments;

    /** Job Flow name. */
    private String jobFlowName;

    /** AWS Access key. */
    private String AWSAccessKey;

    /** AWS secret key. */
    private String AWSSecretKey;

    /** Maximal number of map tasks in a tasktracker. */
    private int taskTrackerMaxMapTasks;

    /** EC2 Key pair name to use. */
    private String ec2KeyName;

    /** Enable debugging. */
    private boolean enableDebugging;

    /** Request built by init(), consumed by runJob(). */
    private RunJobFlowRequest runFlowRequest;
    /** Result of runJob(), consumed by waitForJob(). */
    private RunJobFlowResult runFlowResult;
    /** Client created by runJob(), reused by waitForJob(). */
    private AmazonElasticMapReduce elasticMapReduceClient;

    //
    // Getters
    //

    /**
     * Get the hadoop version.
     * @return Returns the hadoopVersion
     */
    public String getHadoopVersion() {
        return this.hadoopVersion;
    }

    /**
     * Get the number of instances.
     * @return Returns the nInstances
     */
    public int getInstancesNumber() {
        return this.nInstances;
    }

    /**
     * Get the type of the master instance.
     * @return Returns the instanceType of the master
     */
    public String getMasterInstanceType() {
        return this.masterInstanceType;
    }

    /**
     * Get the type of the slave instances.
     * @return Returns the instanceType
     */
    public String getSlavesInstanceType() {
        return this.slavesInstanceType;
    }

    /**
     * Get the endpoint.
     * @return Returns the endpoint
     */
    public String getEndpoint() {
        return this.endpoint;
    }

    /**
     * Get the log path.
     * @return Returns the logPathname
     */
    public String getLogPathname() {
        return this.logPathname;
    }

    /**
     * Get the jar location.
     * @return Returns the jar location
     */
    public String getJarLocation() {
        return this.jarLocation;
    }

    /**
     * Get the jar arguments.
     * @return a defensive copy of the jar arguments, or null if unset
     */
    public String[] getJarArguments() {
        return this.jarArguments == null ? null : this.jarArguments.clone();
    }

    /**
     * Get the job flow name.
     * @return Returns the job flow name
     */
    public String getJobFlowName() {
        return this.jobFlowName;
    }

    /**
     * Get the AWS access key.
     * @return Returns AWS access key
     */
    public String getAWSAccessKey() {
        return this.AWSAccessKey;
    }

    /**
     * Get the AWS secret key.
     * @return Returns AWS secret key
     */
    public String getAWSSecretKey() {
        return this.AWSSecretKey;
    }

    /**
     * Get the number of maximal mapper tasks to use in a task tracker.
     * @return the number of maximal mapper tasks to use in a task tracker
     */
    public int getTaskTrackerMaxMapTasks() {
        return this.taskTrackerMaxMapTasks;
    }

    /**
     * Return the EC2 Key pair name to use.
     * @return EC2 Key pair name to use
     */
    public String getEC2KeyName() {
        return this.ec2KeyName;
    }

    /**
     * Test if debugging is enabled.
     * @return true if debugging is enabled
     */
    public boolean isDebuggingEnabled() {
        return this.enableDebugging;
    }

    //
    // Setters
    //

    /**
     * Set the hadoop version.
     * @param hadoopVersion The hadoop version to set
     */
    void setHadoopVersion(final String hadoopVersion) {
        this.hadoopVersion = hadoopVersion;
    }

    /**
     * Set the number of instances to use.
     * @param nInstances The number of instances to use
     */
    void setInstancesNumber(final int nInstances) {
        this.nInstances = nInstances;
    }

    /**
     * Set the type of master instance.
     * @param instanceType The instanceType to set for master
     */
    void setMasterInstanceType(final String instanceType) {
        this.masterInstanceType = instanceType;
    }

    /**
     * Set the type of slaves instances.
     * @param instanceType The instanceType to set
     */
    void setSlavesInstanceType(final String instanceType) {
        this.slavesInstanceType = instanceType;
    }

    /**
     * Set the endpoint.
     * @param endpoint The endpoint to set
     */
    void setEndpoint(final String endpoint) {
        this.endpoint = endpoint;
    }

    /**
     * Set the log path.
     * @param logPathname The logPathname to set
     */
    void setLogPathname(final String logPathname) {
        this.logPathname = logPathname;
    }

    /**
     * Set the jar location.
     * @param jarLocation The jar location to set
     */
    void setJarLocation(final String jarLocation) {
        this.jarLocation = jarLocation;
    }

    /**
     * Set the jar arguments.
     * @param jarArguments The jar arguments
     */
    void setJarArguments(final String[] jarArguments) {
        // Defensive copy, consistent with getJarArguments()
        this.jarArguments = jarArguments == null ? null : jarArguments.clone();
    }

    /**
     * Set the job flow name.
     * @param jobFlowName The job flow name
     */
    void setJobFlowName(final String jobFlowName) {
        this.jobFlowName = jobFlowName;
    }

    /**
     * Set the AWS access key.
     * @param AWSAccessKey set AWS access key
     */
    void setAWSAccessKey(final String AWSAccessKey) {
        this.AWSAccessKey = AWSAccessKey;
    }

    /**
     * Set the AWS secret key.
     * @param AWSSecretKey set AWS secret key
     */
    void setAWSSecretKey(final String AWSSecretKey) {
        this.AWSSecretKey = AWSSecretKey;
    }

    /**
     * Set the number of maximal mapper tasks to use in a task tracker.
     * @param taskTrackerMaxMapTasks the number of maximal mapper tasks to use in
     *          a task tracker
     */
    void setTaskTrackerMaxMapTasks(final int taskTrackerMaxMapTasks) {
        this.taskTrackerMaxMapTasks = taskTrackerMaxMapTasks;
    }

    /**
     * Set the EC2 Key pair name to use.
     * @param ec2KeyName EC2 Key pair name to use
     */
    void setEC2KeyName(final String ec2KeyName) {
        this.ec2KeyName = ec2KeyName;
    }

    /**
     * Set if debugging must be enabled.
     * @param enableDebugging true if debugging is enabled
     */
    public void setDebugging(final boolean enableDebugging) {
        this.enableDebugging = enableDebugging;
    }

    //
    // Other methods
    //

    /**
     * Validate the configuration and build the RunJobFlowRequest.
     * Must be called before runJob().
     * @throws NullPointerException if a mandatory field is not set
     * @throws IllegalArgumentException if the number of instances is lower than 1
     */
    void init() {

        requireNonNull(this.AWSAccessKey, "AWS access key is not set");
        // Fixed: was a duplicate check of AWSAccessKey, leaving the secret key unchecked
        requireNonNull(this.AWSSecretKey, "AWS secret key is not set");
        requireNonNull(this.jarLocation, "jar location is not set");
        requireNonNull(this.jarArguments, "jar arguments are not set");
        requireNonNull(this.slavesInstanceType, "slaves instance type is not set");
        requireNonNull(this.hadoopVersion, "hadoop version is not set");
        requireNonNull(this.jobFlowName, "job flow name is not set");

        if (this.nInstances < 1) {
            throw new IllegalArgumentException("the number of instance is lower than 1");
        }

        // By default the master uses the same instance type as the slaves
        if (this.masterInstanceType == null) {
            this.masterInstanceType = this.slavesInstanceType;
        }

        // Set the hadoop jar step
        final HadoopJarStepConfig hadoopJarStep = new HadoopJarStepConfig().withJar(this.jarLocation.trim())
                .withArgs(this.jarArguments);

        // Set step config
        final StepConfig stepConfig = new StepConfig().withName(this.jobFlowName + "-step")
                .withHadoopJarStep(hadoopJarStep).withActionOnFailure("TERMINATE_JOB_FLOW");

        // Set the instances
        final JobFlowInstancesConfig instances = new JobFlowInstancesConfig().withInstanceCount(this.nInstances)
                .withMasterInstanceType(this.masterInstanceType).withSlaveInstanceType(this.slavesInstanceType)
                .withHadoopVersion(this.hadoopVersion);

        // Configure hadoop: bootstrap action that caps map tasks per tasktracker
        final ScriptBootstrapActionConfig scriptBootstrapAction = new ScriptBootstrapActionConfig()
                .withPath("s3n://eu-west-1.elasticmapreduce/bootstrap-actions/configure-hadoop")
                .withArgs("--site-key-value",
                        "mapreduce.tasktracker.map.tasks.maximum=" + this.taskTrackerMaxMapTasks);

        final BootstrapActionConfig bootstrapActions = new BootstrapActionConfig().withName("Configure hadoop")
                .withScriptBootstrapAction(scriptBootstrapAction);

        // Enable debugging step (only attached below when requested)
        StepFactory stepFactory = new StepFactory();
        StepConfig enableDebugging = new StepConfig().withName("Enable Debugging")
                .withActionOnFailure("TERMINATE_JOB_FLOW").withHadoopJarStep(stepFactory.newEnableDebuggingStep());

        // Run flow
        this.runFlowRequest = new RunJobFlowRequest().withName(this.jobFlowName);

        // Enable or not debugging
        if (this.enableDebugging) {
            this.runFlowRequest.withInstances(instances).withSteps(enableDebugging, stepConfig);
        } else {
            this.runFlowRequest.withInstances(instances).withSteps(stepConfig);
        }

        // Limit the number of task in a task tracker
        if (this.taskTrackerMaxMapTasks > 0) {
            this.runFlowRequest.withBootstrapActions(bootstrapActions);
        }

        if (this.logPathname != null && !"".equals(this.logPathname)) {
            this.runFlowRequest.withLogUri(this.logPathname);
        }

        // Set EC2 Key name
        if (this.ec2KeyName != null) {
            this.runFlowRequest.getInstances().setEc2KeyName(this.ec2KeyName);
        }
    }

    /**
     * Run the job.
     * @return the JobFlowId of the job
     */
    public String runJob() {

        // Get the credentials
        final AWSCredentials credentials = new BasicAWSCredentials(this.AWSAccessKey, this.AWSSecretKey);

        // Create the Amazon Elastic MapReduce object
        this.elasticMapReduceClient = new AmazonElasticMapReduceClient(credentials);

        // Set the end point
        this.elasticMapReduceClient.setEndpoint(this.endpoint);

        this.runFlowResult = this.elasticMapReduceClient.runJobFlow(this.runFlowRequest);

        return this.runFlowResult.getJobFlowId();
    }

    /**
     * Wait for the end of the job by polling its state.
     * @param secondBetweenChecking number of seconds to wait between 2 checks
     * @return the final state of the job, or null if the job was not started or
     *         the wait was interrupted
     */
    public String waitForJob(final int secondBetweenChecking) {

        if (this.runFlowResult == null) {
            return null;
        }

        final DescribeJobFlowsRequest describeJobFlowsRequest = new DescribeJobFlowsRequest()
                .withJobFlowIds(this.runFlowResult.getJobFlowId());

        String state = null;
        String lastState = null;
        int failCount = 0;

        try {

            do {

                // Long multiplication to avoid int overflow for large intervals
                Thread.sleep(1000L * secondBetweenChecking);

                try {
                    final DescribeJobFlowsResult jobFlowsResult = this.elasticMapReduceClient
                            .describeJobFlows(describeJobFlowsRequest);
                    final JobFlowDetail detail = jobFlowsResult.getJobFlows().get(0);
                    final JobFlowExecutionStatusDetail executionStatusDetail = detail.getExecutionStatusDetail();
                    // A successful poll resets the consecutive-failure counter
                    failCount = 0;

                    state = executionStatusDetail.getState();
                } catch (AmazonClientException ace) {

                    failCount++;
                    getLogger().warning("Amazon client exception: " + ace.getMessage());

                    // Give up only after MAX_FAIL_COUNT consecutive failures
                    if (failCount >= MAX_FAIL_COUNT) {
                        throw ace;
                    }

                }

                // Log only state transitions
                if (lastState == null || !lastState.equals(state)) {

                    getLogger().info("State of the job " + this.runFlowResult.getJobFlowId() + ": " + state);
                    lastState = state;
                }

            } while (state != null && !state.equals("COMPLETED") && !state.equals("FAILED")
                    && !state.equals("TERMINATED"));

            return state;

        } catch (InterruptedException e) {
            // Restore the interrupt status so callers can observe the interruption
            Thread.currentThread().interrupt();
            getLogger().warning("Error while waiting AWS Elastic MapReduce Job: " + e.getMessage());
        }

        return null;
    }

    //
    // Constructor
    //

    /**
     * Package constructor. Instances are configured via the package-private
     * setters and then initialized with init().
     */
    AWSElasticMapReduceJob() {
    }
}