org.apache.sqoop.submission.spark.SparkSubmissionEngine.java Source code

Introduction

Here is the source code for org.apache.sqoop.submission.spark.SparkSubmissionEngine.java.
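
The class builds a global Hadoop Configuration from the *-site.xml files in a configured directory, starts a local-mode JavaSparkContext, and executes each submission by reading the job's splits through SqoopInputFormatSpark into an RDD, mapping that RDD via SparkMapTrigger, and writing the result out with saveAsNewAPIHadoopDataset.

As a rough illustration of how the lifecycle methods fit together, here is a minimal, hypothetical sketch. It is not part of the original code: in a real Sqoop 2 server the driver supplies the MapContext, property prefix and SparkJobRequest, so the runOnce() method and its parameters below are placeholders.

    // Hypothetical driver-side sketch; context, prefix and request would come from the Sqoop framework.
    void runOnce(MapContext context, String prefix, SparkJobRequest request) {
        SparkSubmissionEngine engine = new SparkSubmissionEngine();
        engine.initialize(context, prefix);        // loads *-site.xml files and starts a local JavaSparkContext
        try {
            if (engine.isExecutionEngineSupported(SparkExecutionEngine.class)) {
                boolean succeeded = engine.submit(request);   // runs the Spark job synchronously
            }
        } finally {
            engine.destroy();                      // stops the JavaSparkContext
        }
    }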

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.sqoop.submission.spark;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sqoop.common.Direction;
import org.apache.sqoop.common.MapContext;
import org.apache.sqoop.common.SqoopException;
import org.apache.sqoop.connector.idf.IntermediateDataFormat;
import org.apache.sqoop.driver.JobRequest;
import org.apache.sqoop.driver.SubmissionEngine;
import org.apache.sqoop.error.code.SparkSubmissionError;
import org.apache.sqoop.execution.spark.SparkExecutionEngine;
import org.apache.sqoop.execution.spark.SparkJobRequest;
import org.apache.sqoop.execution.spark.SqoopInputFormatSpark;
import org.apache.sqoop.execution.spark.SqoopWritableListWrapper;
import org.apache.sqoop.mapredsparkcommon.MRConfigurationUtils;
import org.apache.sqoop.mapredsparkcommon.MRJobConstants;
import org.apache.sqoop.mapredsparkcommon.SqoopSplit;
import org.apache.sqoop.mapredsparkcommon.SqoopWritable;
import org.apache.sqoop.model.MSubmission;
import org.apache.sqoop.model.SubmissionError;
import org.apache.sqoop.submission.SubmissionStatus;

import java.io.File;
import java.io.FilenameFilter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.util.Map;

/**
 * This is a very simple and straightforward implementation of a Spark-based
 * submission engine.
 */
public class SparkSubmissionEngine extends SubmissionEngine {

    private static final Logger LOG = Logger.getLogger(SparkSubmissionEngine.class);
    private SparkConf sparkConf;
    private JavaSparkContext sc;

    /**
     * Global configuration object that is built from Hadoop configuration files
     * on engine initialization and cloned for each new submission.
     */
    private Configuration globalConfiguration;
    //private transient JobConf jobConf;

    /**
     * {@inheritDoc}
     */
    @Override
    public void initialize(MapContext context, String prefix) {
        super.initialize(context, prefix);
        LOG.info("Initializing Spark Submission Engine");

        // Build global configuration, start with empty configuration object
        globalConfiguration = new Configuration();
        globalConfiguration.clear();

        // Load configured hadoop configuration directory
        String configDirectory = context.getString(prefix + Constants.CONF_CONFIG_DIR);

        // Get the list of files ending with "-site.xml" (configuration files)
        File dir = new File(configDirectory);
        String[] files = dir.list(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith("-site.xml");
            }
        });

        if (files == null) {
            throw new SqoopException(SparkSubmissionError.SPARK_0002,
                    "Invalid Hadoop configuration directory (not a directory or permission issues): "
                            + configDirectory);
        }

        // Add each such file to our global configuration object
        for (String file : files) {
            LOG.info("Found hadoop configuration file " + file);
            try {
                globalConfiguration.addResource(new File(configDirectory, file).toURI().toURL());
            } catch (MalformedURLException e) {
                LOG.error("Can't load configuration file: " + file, e);
            }
        }

        // Save our own property inside the job to easily identify Sqoop jobs
        globalConfiguration.setBoolean(Constants.SQOOP_JOB, true);

        // Initialize the Spark context (note: the master is hard-coded to local mode here)
        sparkConf = new SparkConf().setAppName("Sqoop on Spark").setMaster("local");
        sc = new JavaSparkContext(sparkConf);

    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void destroy() {
        super.destroy();
        LOG.info("Destroying Spark Submission Engine");
        sc.stop();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean isExecutionEngineSupported(Class<?> executionEngineClass) {
        return executionEngineClass == SparkExecutionEngine.class;
        //return true;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean submit(JobRequest sparkJobRequest) {

        // This additional configuration setup must be done on each submission
        // (as in the MR engine)
        SparkJobRequest request = (SparkJobRequest) sparkJobRequest;

        // Clone global configuration
        //jackh: Check 'final' - probably added by Intellij while refactoring conf (from run() in map()) to configuration
        final Configuration configuration = new Configuration(globalConfiguration);

        // Serialize driver context into job configuration
        for (Map.Entry<String, String> entry : request.getDriverContext()) {
            if (entry.getValue() == null) {
                LOG.warn("Ignoring null driver context value for key " + entry.getKey());
                continue;
            }
            configuration.set(entry.getKey(), entry.getValue());
        }

        // Serialize connector context as a sub namespace
        for (Map.Entry<String, String> entry : request.getConnectorContext(Direction.FROM)) {
            if (entry.getValue() == null) {
                LOG.warn("Ignoring null connector context value for key " + entry.getKey());
                continue;
            }
            configuration.set(MRJobConstants.PREFIX_CONNECTOR_FROM_CONTEXT + entry.getKey(), entry.getValue());
        }

        for (Map.Entry<String, String> entry : request.getConnectorContext(Direction.TO)) {
            if (entry.getValue() == null) {
                LOG.warn("Ignoring null connector context value for key " + entry.getKey());
                continue;
            }
            configuration.set(MRJobConstants.PREFIX_CONNECTOR_TO_CONTEXT + entry.getKey(), entry.getValue());
        }

        // Promote all required jars to the job
        configuration.set("tmpjars", StringUtils.join(request.getJars(), ","));

        try {
            Job job = Job.getInstance(configuration);

            // Adding link, job and connector schema configurations to the MapReduce configuration object instead of the
            // Hadoop credentials cache. This is because Hadoop, for security reasons, does not serialize the credentials
            // cache for sending over the wire (only the Configuration object is serialized, while the credentials cache
            // resides in the JobConf object).
            // Adding this configuration information to the Configuration object and sending it over the wire is a
            // security issue that must be addressed later.

            // from and to link configs
            MRConfigurationUtils.setConnectorLinkConfigUnsafe(Direction.FROM, job.getConfiguration(),
                    request.getConnectorLinkConfig(Direction.FROM));
            MRConfigurationUtils.setConnectorLinkConfigUnsafe(Direction.TO, job.getConfiguration(),
                    request.getConnectorLinkConfig(Direction.TO));

            // from and to job configs
            MRConfigurationUtils.setConnectorJobConfigUnsafe(Direction.FROM, job.getConfiguration(),
                    request.getJobConfig(Direction.FROM));
            MRConfigurationUtils.setConnectorJobConfigUnsafe(Direction.TO, job.getConfiguration(),
                    request.getJobConfig(Direction.TO));

            // driver config
            MRConfigurationUtils.setDriverConfig(job, request.getDriverConfig());

            // from and to connector schemas
            MRConfigurationUtils.setConnectorSchemaUnsafe(Direction.FROM, job.getConfiguration(),
                    request.getJobSubmission().getFromSchema());
            MRConfigurationUtils.setConnectorSchemaUnsafe(Direction.TO, job.getConfiguration(),
                    request.getJobSubmission().getToSchema());

            // Retained to minimize changes to existing, functioning code
            MRConfigurationUtils.setConnectorLinkConfig(Direction.FROM, job,
                    request.getConnectorLinkConfig(Direction.FROM));
            MRConfigurationUtils.setConnectorLinkConfig(Direction.TO, job,
                    request.getConnectorLinkConfig(Direction.TO));
            MRConfigurationUtils.setConnectorJobConfig(Direction.FROM, job, request.getJobConfig(Direction.FROM));
            MRConfigurationUtils.setConnectorJobConfig(Direction.TO, job, request.getJobConfig(Direction.TO));
            MRConfigurationUtils.setConnectorSchema(Direction.FROM, job,
                    request.getJobSubmission().getFromSchema());
            MRConfigurationUtils.setConnectorSchema(Direction.TO, job, request.getJobSubmission().getToSchema());

            if (request.getJobName() != null) {
                job.setJobName("Sqoop: " + request.getJobName());
            } else {
                job.setJobName("Sqoop job with id: " + request.getJobId());
            }

            job.setInputFormatClass(request.getInputFormatClass());

            job.setOutputFormatClass(request.getOutputFormatClass());
            job.setOutputKeyClass(request.getOutputKeyClass());
            job.setOutputValueClass(request.getOutputValueClass());

            // Form the initial RDD from the Hadoop configuration object set up above
            JavaPairRDD<SqoopSplit, SqoopSplit> initRDD = sc.newAPIHadoopRDD(job.getConfiguration(),
                    SqoopInputFormatSpark.class, SqoopSplit.class, SqoopSplit.class);

            // For debugging - check size of initial RDD; remove in production
            int numPartitions = initRDD.partitions().size();

            // Create SparkMapTrigger object and use it to trigger mapToPair()
            ConfigurationWrapper wrappedConf = new ConfigurationWrapper(job.getConfiguration());
            SparkMapTrigger sparkMapTriggerObj = new SparkMapTrigger(initRDD, wrappedConf);
            JavaPairRDD<IntermediateDataFormat<Object>, Integer> mappedRDD = sparkMapTriggerObj.triggerSparkMap();

            // Add reduce phase/any transformation code here
            // For debugging - check size of RDD before partitioning; remove in production
            numPartitions = mappedRDD.partitions().size();

            // Default to the mapped RDD so that the write below also works when no loaders are specified
            JavaPairRDD<IntermediateDataFormat<Object>, Integer> repartitionedRDD = mappedRDD;

            // Get number of loaders, if specified
            if (request.getLoaders() != null) {
                long numLoaders = request.getLoaders();
                long numExtractors = (request.getExtractors() != null) ? (request.getExtractors())
                        : (job.getConfiguration().getLong(MRJobConstants.JOB_ETL_EXTRACTOR_NUM, 10));

                if (numLoaders > numExtractors) {
                    // Repartition the RDD: yields evenly balanced partitions but has a shuffle cost
                    repartitionedRDD = mappedRDD.repartition(request.getLoaders());
                } else if (numLoaders < numExtractors) {
                    // Use coalesce() in this case. Shuffle tradeoff: turning the shuffle on gives evenly balanced
                    // partitions and hence an optimal write time, but incurs network cost; turning it off avoids
                    // the network cost but may lead to sub-optimal write performance if the partitioning produced
                    // by the InputFormat was skewed in the first place.
                    repartitionedRDD = mappedRDD.coalesce(request.getLoaders(), false);
                }
                // If loaders equal extractors, keep the existing partitioning
            }

            // For debugging - check size of RDD after partitioning; remove in production
            numPartitions = repartitionedRDD.partitions().size();

            // Calls the OutputFormat for writing
            //mappedRDD.saveAsNewAPIHadoopDataset(job.getConfiguration());
            repartitionedRDD.saveAsNewAPIHadoopDataset(job.getConfiguration());

            // If we reach this point, the data transfer completed successfully
            request.getJobSubmission().setStatus(SubmissionStatus.SUCCEEDED);

            return true;

        } catch (Exception e) {
            SubmissionError error = new SubmissionError();
            error.setErrorSummary(e.toString());
            StringWriter writer = new StringWriter();
            e.printStackTrace(new PrintWriter(writer));
            writer.flush();
            error.setErrorDetails(writer.toString());

            request.getJobSubmission().setError(error);
            LOG.error("Error in submitting job", e);
            return false;
        }

    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void stop(String externalJobId) {
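        // No-op: stopping a running Spark job is not implemented in this engine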

    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void update(MSubmission submission) {
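        // No-op: submission status updates are not implemented in this engine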

    }
}
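
Note on wiring: in a typical Sqoop 2 deployment the submission engine class is selected through the server configuration, and the prefix passed to initialize() determines which properties the engine reads; in particular, the Hadoop configuration directory whose *-site.xml files are loaded above is taken from the property formed by that prefix plus Constants.CONF_CONFIG_DIR. How the prefix itself is configured is server-specific and not shown in this file.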