com.streamsets.datacollector.cluster.EmrClusterProvider.java Source code

Introduction

Here is the source code for com.streamsets.datacollector.cluster.EmrClusterProvider.java, the StreamSets Data Collector cluster provider that stages, submits, monitors, and tears down cluster pipelines on Amazon EMR.
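
At a glance, the class implements the EMR side of the cluster-pipeline lifecycle: startPipelineExecute stages the job on S3 and submits it, getStatus polls the cluster and then the job, and killPipeline and cleanUp tear everything down. As a rough orientation, a caller on the Data Collector classpath might drive those methods as in the following sketch (the supervisor class and its deadline policy are hypothetical; the provider, pipeline objects, and temp directory are assumed to be supplied by the runtime):

import java.io.File;

import com.streamsets.datacollector.cluster.ApplicationState;
import com.streamsets.datacollector.cluster.ClusterPipelineStatus;
import com.streamsets.datacollector.cluster.EmrClusterProvider;
import com.streamsets.datacollector.config.PipelineConfiguration;
import com.streamsets.datacollector.creation.PipelineConfigBean;

// Hypothetical supervisor: polls a started EMR pipeline and kills it past a deadline.
public class EmrLifecycleSketch {

    static void supervise(EmrClusterProvider provider, File tempDir, ApplicationState appState,
            PipelineConfiguration pipelineConf, PipelineConfigBean configBean,
            long deadlineMillis) throws Exception {
        long start = System.currentTimeMillis();
        while (provider.getStatus(tempDir, appState, pipelineConf, configBean)
                .equals(ClusterPipelineStatus.RUNNING)) {
            if (System.currentTimeMillis() - start > deadlineMillis) {
                // Past the deadline: terminate the YARN job running on the cluster.
                provider.killPipeline(tempDir, appState, pipelineConf, configBean);
                break;
            }
            Thread.sleep(10_000L); // poll every 10 seconds
        }
        // Always remove the staged S3 files; this also terminates the cluster if the
        // provider provisioned it and the config requests termination.
        provider.cleanUp(appState, pipelineConf, configBean);
    }
}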

Source

/*
 * Copyright 2018 StreamSets Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.datacollector.cluster;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import com.streamsets.datacollector.config.PipelineConfiguration;
import com.streamsets.datacollector.creation.PipelineConfigBean;
import com.streamsets.datacollector.main.RuntimeInfo;
import com.streamsets.datacollector.security.SecurityConfiguration;
import com.streamsets.datacollector.stagelibrary.StageLibraryTask;
import com.streamsets.datacollector.util.Configuration;
import com.streamsets.datacollector.validation.Issue;
import com.streamsets.pipeline.api.StageException;
import com.streamsets.pipeline.api.delegate.exported.ClusterJob;
import com.streamsets.pipeline.api.impl.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeoutException;

public class EmrClusterProvider extends BaseClusterProvider {

    private static final Logger LOG = LoggerFactory.getLogger(EmrClusterProvider.class);

    public EmrClusterProvider(RuntimeInfo runtimeInfo, SecurityConfiguration securityConfiguration,
            Configuration conf, StageLibraryTask stageLibraryTask) {
        super(runtimeInfo, securityConfiguration, conf, stageLibraryTask);
    }

    /** Terminates the YARN job of a running EMR pipeline, resolving its application ID first. */
    @Override
    public void killPipeline(File tempDir, ApplicationState applicationState,
            PipelineConfiguration pipelineConfiguration, PipelineConfigBean pipelineConfigBean)
            throws TimeoutException, IOException, StageException {
        Utils.checkNotNull(applicationState.getEmrConfig(), "EMR cluster config");
        Properties emrJobProps = applicationState.getEmrConfig();
        ClusterJob.Client clusterJobClient = getClusterJobDelegator(pipelineConfiguration)
                .getClient(pipelineConfigBean.amazonEMRConfig.convertToProperties());
        Properties emrStateProps = clusterJobClient.getJobStatus(emrJobProps);
        String appId = emrStateProps.getProperty("appId");
        if (appId == null) {
            throw new RuntimeException("Cannot retrieve the YARN application ID from the EMR cluster");
        }
        emrJobProps.setProperty("appId", appId);
        clusterJobClient.terminateJob(emrJobProps);
    }

    /** Reports pipeline status: checks the cluster state first and, only while the cluster is running, the job state. */
    @Override
    public ClusterPipelineStatus getStatus(File tempDir, ApplicationState applicationState,
            PipelineConfiguration pipelineConfiguration, PipelineConfigBean pipelineConfigBean)
            throws TimeoutException, IOException, StageException {
        Utils.checkNotNull(applicationState.getEmrConfig(), "EMR cluster config");
        Properties emrJobProps = applicationState.getEmrConfig();
        ClusterJob.Client clusterJobClient = getClusterJobDelegator(pipelineConfiguration)
                .getClient(pipelineConfigBean.amazonEMRConfig.convertToProperties());
        Properties emrStateProps = clusterJobClient.getClusterStatus(emrJobProps.getProperty("clusterId"));
        ClusterPipelineStatus clusterPipelineStatus = EmrStatusParser.parseClusterStatus(emrStateProps);
        if (clusterPipelineStatus == ClusterPipelineStatus.RUNNING) {
            emrStateProps = clusterJobClient.getJobStatus(emrJobProps);
            clusterPipelineStatus = EmrStatusParser.parseJobStatus(emrStateProps);
        }
        return clusterPipelineStatus;
    }

    /** Deletes the staged job files from S3 and, when this provider provisioned the cluster, optionally terminates it. */
    @Override
    public void cleanUp(ApplicationState applicationState, PipelineConfiguration pipelineConfiguration,
            PipelineConfigBean pipelineConfigBean) throws IOException, StageException {
        Utils.checkNotNull(applicationState.getEmrConfig(), "EMR cluster config");
        Properties emrJobProps = applicationState.getEmrConfig();
        ClusterJob.Client clusterJobClient = getClusterJobDelegator(pipelineConfiguration)
                .getClient(pipelineConfigBean.amazonEMRConfig.convertToProperties());

        // Only terminate the cluster if this provider provisioned it and the config requests termination.
        if (pipelineConfigBean.amazonEMRConfig.provisionNewCluster
                && pipelineConfigBean.amazonEMRConfig.terminateCluster) {
            clusterJobClient.terminateCluster(emrJobProps.getProperty("clusterId"));
        }
        clusterJobClient.deleteJobFiles(emrJobProps);
    }

    /** Stages the driver jar and archives on S3, provisions or reuses an EMR cluster, and submits the batch job. */
    @Override
    protected ApplicationState startPipelineExecute(File outputDir, Map<String, String> sourceInfo,
            PipelineConfiguration pipelineConfiguration, PipelineConfigBean pipelineConfigBean,
            long timeToWaitForFailure, File stagingDir, String clusterToken, File clusterBootstrapJar,
            File bootstrapJar, Set<String> jarsToShip, File libsTarGz, File resourcesTarGz, File etcTarGz,
            File sdcPropertiesFile, File log4jProperties, String mesosHostingJarDir, String mesosURL,
            String clusterBootstrapApiJar, List<Issue> errors) throws IOException, StageException {
        jarsToShip = Sets.newHashSet(jarsToShip); // defensive copy before adding the bootstrap jar
        jarsToShip.add(bootstrapJar.getAbsolutePath());
        File stagingDriverJar = new File(stagingDir, new File(clusterBootstrapApiJar).getName());
        copyFile(new File(clusterBootstrapApiJar), stagingDriverJar);
        // SDC configuration must be readable by the Hadoop driver (cluster-bootstrap-api).
        // That file cannot live on S3, because the driver cannot take on an extra S3 client
        // dependency, so the properties file is bundled into the cluster driver jar instead
        // (see replaceFileInJar below, and the standalone example after the class).
        replaceFileInJar(stagingDriverJar.getAbsolutePath(), sdcPropertiesFile.getAbsolutePath());

        Properties clusterJobProps = pipelineConfigBean.amazonEMRConfig.convertToProperties();
        Properties jobProps = new Properties();

        jobProps.setProperty("pipelineId", pipelineConfiguration.getPipelineId());
        jobProps.setProperty("uniquePrefix", UUID.randomUUID().toString());
        jobProps.setProperty("jobName", pipelineConfiguration.getTitle());

        ClusterJob.Client clusterJobClient = getClusterJobDelegator(pipelineConfiguration)
                .getClient(clusterJobProps);

        String clusterId;
        if (pipelineConfigBean.amazonEMRConfig.provisionNewCluster) {
            String clusterName = getEmrClusterName(pipelineConfigBean.amazonEMRConfig.clusterPrefix,
                    getRuntimeInfo().getId(), pipelineConfiguration.getPipelineId());
            clusterId = clusterJobClient.getActiveCluster(clusterName);
            if (clusterId == null) {
                clusterId = clusterJobClient.createCluster(clusterName);
                LOG.info("Starting EMR cluster, id is {}", clusterId);
            }
        } else {
            clusterId = pipelineConfigBean.amazonEMRConfig.clusterId;
        }
        jobProps.setProperty("clusterId", clusterId);
        ApplicationState applicationState = new ApplicationState();
        boolean isError = false;
        try {
            String driverJarS3 = clusterJobClient.uploadJobFiles(jobProps, ImmutableList.of(stagingDriverJar))
                    .get(0);

            List<String> archivesUri = clusterJobClient.uploadJobFiles(jobProps,
                    ImmutableList.of(libsTarGz, resourcesTarGz, etcTarGz));

            List<String> libJarsS3Uris = clusterJobClient.uploadJobFiles(jobProps,
                    ImmutableList.copyOf(jarsToShip.stream().map(File::new).iterator()));

            jobProps.setProperty("libjars", Joiner.on(",").join(libJarsS3Uris));
            jobProps.setProperty("archives", Joiner.on(",").join(archivesUri));
            jobProps.setProperty("driverJarPath", driverJarS3);
            jobProps.setProperty("driverMainClass", "com.streamsets.pipeline.BootstrapEmrBatch");
            jobProps.setProperty("javaopts",
                    Joiner.on(" ").join(String.format("-Xmx%sm", pipelineConfigBean.clusterSlaveMemory),
                            pipelineConfigBean.clusterSlaveJavaOpts));
            jobProps.setProperty("logLevel", pipelineConfigBean.logLevel.getLabel());

            LOG.info("Submitting job to cluster: {}", clusterId);
            jobProps = clusterJobClient.submitJob(jobProps);
        } catch (Exception e) {
            isError = true;
            String msg = Utils.format("Submission failed due to: {}", e);
            LOG.error(msg, e);
            throw new IOException(msg, e);
        } finally {
            applicationState.setEmrConfig(jobProps);
            if (isError) {
                cleanUp(applicationState, pipelineConfiguration, pipelineConfigBean);
            }
        }
        return applicationState;
    }

    /** Builds a deterministic cluster name of the form clusterPrefix::sdcId::pipelineId. */
    private String getEmrClusterName(String clusterPrefix, String sdcId, String pipelineId) {
        return clusterPrefix + "::" + sdcId + "::" + pipelineId;
    }

    /** Copies the given file into the jar at absolutePath as the entry cluster_sdc.properties, via the JDK zip filesystem. */
    void replaceFileInJar(String absolutePath, String fileToBeCopied) throws IOException {
        Map<String, String> env = new HashMap<>();
        env.put("create", "true");
        URI uri = URI.create("jar:file:" + absolutePath);

        try (FileSystem zipfs = FileSystems.newFileSystem(uri, env)) {
            Path externalTxtFile = Paths.get(fileToBeCopied);
            Path pathInZipfile = zipfs.getPath("cluster_sdc.properties");
            Files.copy(externalTxtFile, pathInZipfile, StandardCopyOption.REPLACE_EXISTING);
        }
    }

    void copyFile(File origin, File target) throws IOException {
        try (InputStream is = new FileInputStream(origin); OutputStream os = new FileOutputStream(target)) {
            org.apache.commons.io.IOUtils.copy(is, os);
        }
    }

}
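
Example

The replaceFileInJar helper above relies on the JDK's built-in zip filesystem provider ("jar:" URIs), which lets jar entries be manipulated as ordinary Paths. The technique can be tried standalone with nothing but the JDK; the sketch below creates a jar (all file names here are illustrative) and copies a properties file into it under the fixed entry name cluster_sdc.properties, mirroring what the provider does to the staged driver jar:

import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.HashMap;
import java.util.Map;

public class ZipFsReplaceDemo {

    public static void main(String[] args) throws Exception {
        Path jar = Paths.get("demo.jar");
        Path props = Paths.get("demo_sdc.properties");
        Files.write(props, "log.level=INFO\n".getBytes(StandardCharsets.UTF_8));

        Map<String, String> env = new HashMap<>();
        env.put("create", "true"); // create the archive if it does not exist yet

        // Mount the jar as a filesystem; entries become ordinary Paths.
        URI uri = URI.create("jar:" + jar.toUri());
        try (FileSystem zipfs = FileSystems.newFileSystem(uri, env)) {
            Path entry = zipfs.getPath("cluster_sdc.properties");
            // REPLACE_EXISTING makes this idempotent: re-runs overwrite the entry.
            Files.copy(props, entry, StandardCopyOption.REPLACE_EXISTING);
        }
        System.out.println("Wrote cluster_sdc.properties into " + jar.toAbsolutePath());
    }
}

Building the URI from Path.toUri(), as done here, also copes with spaces and Windows paths; the plain "jar:file:" + absolutePath concatenation in the listing assumes a well-formed absolute Unix path.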