org.apache.sqoop.mapreduce.ImportJobBase.java Source code

Introduction

Here is the source code for org.apache.sqoop.mapreduce.ImportJobBase.java. ImportJobBase is the base class for Sqoop's import MapReduce jobs: it configures the output format and compression, submits the job, reports transfer statistics, and exposes hooks for subclasses to customize setup, completion, and teardown.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.sqoop.mapreduce;

import java.io.IOException;
import java.sql.SQLException;

import org.apache.avro.file.DataFileConstants;
import org.apache.avro.mapred.AvroJob;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.sqoop.mapreduce.hcat.SqoopHCatUtilities;
import org.apache.sqoop.util.PerfCounters;
import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.config.ConfigurationHelper;
import com.cloudera.sqoop.io.CodecMap;
import com.cloudera.sqoop.manager.ImportJobContext;
import com.cloudera.sqoop.mapreduce.JobBase;
import com.cloudera.sqoop.orm.TableClassName;
import com.cloudera.sqoop.util.ImportException;
import org.apache.sqoop.validation.*;

/**
 * Base class for running an import MapReduce job.
 * Allows dependency injection, etc., for easy customization of import job types.
 */
public class ImportJobBase extends JobBase {

    private ImportJobContext context;

    public static final Log LOG = LogFactory.getLog(ImportJobBase.class.getName());

    /** Controls how java.math.BigDecimal values are converted to Strings.
     *  If set to true (the default), the toPlainString() method is used;
     *  if set to false, the toString() method is used.
     */
    public static final String PROPERTY_BIGDECIMAL_FORMAT = "sqoop.bigdecimal.format.string";
    public static final boolean PROPERTY_BIGDECIMAL_FORMAT_DEFAULT = true;
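    // For example, new java.math.BigDecimal("1E+3") renders as "1000" via
    // toPlainString() but as "1E+3" via toString().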

    public ImportJobBase() {
        this(null);
    }

    public ImportJobBase(final SqoopOptions opts) {
        this(opts, null, null, null, null);
    }

    public ImportJobBase(final SqoopOptions opts, final Class<? extends Mapper> mapperClass,
            final Class<? extends InputFormat> inputFormatClass,
            final Class<? extends OutputFormat> outputFormatClass, final ImportJobContext context) {
        super(opts, mapperClass, inputFormatClass, outputFormatClass);
        this.context = context;
    }

    /**
     * Configure the output format to use for the job.
     */
    @Override
    protected void configureOutputFormat(Job job, String tableName, String tableClassName)
            throws ClassNotFoundException, IOException {

        job.setOutputFormatClass(getOutputFormatClass());

        if (isHCatJob) {
            LOG.debug("Configuring output format for HCatalog  import job");
            SqoopHCatUtilities.configureImportOutputFormat(options, job, getContext().getConnManager(), tableName,
                    job.getConfiguration());
            return;
        }

        if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
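            // For SequenceFiles, the output value class is the generated
            // record class named by tableClassName.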
            job.getConfiguration().set("mapred.output.value.class", tableClassName);
        }

        if (options.shouldUseCompression()) {
            FileOutputFormat.setCompressOutput(job, true);

            String codecName = options.getCompressionCodec();
            Class<? extends CompressionCodec> codecClass;
            if (codecName == null) {
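                // No codec was requested explicitly; default to gzip.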
                codecClass = GzipCodec.class;
            } else {
                Configuration conf = job.getConfiguration();
                codecClass = CodecMap.getCodec(codecName, conf).getClass();
            }
            FileOutputFormat.setOutputCompressorClass(job, codecClass);

            if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
                SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
            }

            // SQOOP-428: Avro expects not a fully qualified class name but a "short"
            // name instead (e.g. "snappy") and it needs to be set in a custom
            // configuration option called "avro.output.codec".
            // The default codec is "deflate".
            if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
                if (codecName != null) {
                    String shortName = CodecMap.getCodecShortNameByName(codecName, job.getConfiguration());
                    // Avro only knows about "deflate" and not "default"
                    if (shortName.equalsIgnoreCase("default")) {
                        shortName = "deflate";
                    }
                    job.getConfiguration().set(AvroJob.OUTPUT_CODEC, shortName);
                } else {
                    job.getConfiguration().set(AvroJob.OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC);
                }
            }
        }

        Path outputPath = context.getDestination();
        FileOutputFormat.setOutputPath(job, outputPath);
    }

    /**
     * Actually run the MapReduce job.
     */
    @Override
    protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {

        PerfCounters perfCounters = new PerfCounters();
        perfCounters.startClock();

        boolean success = doSubmitJob(job);

        if (isHCatJob) {
            SqoopHCatUtilities.instance().invokeOutputCommitterForLocalMode(job);
        }

        perfCounters.stopClock();

        Counters jobCounters = job.getCounters();
        // If the job has been retired, these may be unavailable.
        if (null == jobCounters) {
            displayRetiredJobNotice(LOG);
        } else {
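            // "FileSystemCounters" is the legacy counter group name; newer
            // Hadoop releases map it to the FileSystemCounter group.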
            perfCounters.addBytes(
                    jobCounters.getGroup("FileSystemCounters").findCounter("HDFS_BYTES_WRITTEN").getValue());
            LOG.info("Transferred " + perfCounters.toString());
            long numRecords = ConfigurationHelper.getNumMapOutputRecords(job);
            LOG.info("Retrieved " + numRecords + " records.");
        }
        return success;
    }

    /**
     * Submit the MapReduce job.
     */
    protected boolean doSubmitJob(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        return job.waitForCompletion(true);
    }

    /**
     * Run an import job to read a table into HDFS.
     *
     * @param tableName  the database table to read; may be null if a free-form
     * query is specified in the SqoopOptions, and the ImportJobBase subclass
     * supports free-form queries.
     * @param ormJarFile the jar file to insert into the distributed cache
     * (dcache) classpath. (may be null)
     * @param splitByCol the column of the database table to use to split
     * the import
     * @param conf A fresh Hadoop Configuration to use to build an MR job.
     * @throws IOException if the job encountered an IO problem
     * @throws ImportException if the job failed unexpectedly or was
     * misconfigured.
     */
    public void runImport(String tableName, String ormJarFile, String splitByCol, Configuration conf)
            throws IOException, ImportException {
        // Perform runtime compatibility checks before starting the job.
        if (isHCatJob && options.isDirect() && !context.getConnManager().isDirectModeHCatSupported()) {
            throw new IOException("Direct import is not compatible with "
                    + "HCatalog operations using the connection manager "
                    + context.getConnManager().getClass().getName() + ". Please remove the parameter --direct");
        }

        if (null != tableName) {
            LOG.info("Beginning import of " + tableName);
        } else {
            LOG.info("Beginning query import.");
        }
        String tableClassName = null;
        if (!getContext().getConnManager().isORMFacilitySelfManaged()) {
            tableClassName = new TableClassName(options).getClassForTable(tableName);
        }
        // When the ORM facility is self-managed, tableClassName is left null
        // so that we don't check for non-existent classes.

        loadJars(conf, ormJarFile, tableClassName);

        Job job = createJob(conf);
        try {
            // Set the external jar to use for the job.
            job.getConfiguration().set("mapred.jar", ormJarFile);
            if (options.getMapreduceJobName() != null) {
                job.setJobName(options.getMapreduceJobName());
            }

            propagateOptionsToJob(job);
            configureInputFormat(job, tableName, tableClassName, splitByCol);
            configureOutputFormat(job, tableName, tableClassName);
            configureMapper(job, tableName, tableClassName);
            configureNumTasks(job);
            cacheJars(job, getContext().getConnManager());

            jobSetup(job);
            setJob(job);
            boolean success = runJob(job);
            if (!success) {
                throw new ImportException("Import job failed!");
            }

            completeImport(job);

            if (options.isValidationEnabled()) {
                validateImport(tableName, conf, job);
            }
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        } catch (ClassNotFoundException cnfe) {
            throw new IOException(cnfe);
        } finally {
            unloadJars();
            jobTeardown(job);
        }
    }

    /**
     * Perform any operation that needs to be done post map/reduce job to
     * complete the import.
     */
    protected void completeImport(Job job) throws IOException, ImportException {
    }

    protected void validateImport(String tableName, Configuration conf, Job job) throws ImportException {
        LOG.debug("Validating imported data.");
        try {
            ValidationContext validationContext = new ValidationContext(
                    getRowCountFromDB(context.getConnManager(), tableName), // source
                    getRowCountFromHadoop(job)); // target

            doValidate(options, conf, validationContext);
        } catch (ValidationException e) {
            throw new ImportException("Error validating row counts", e);
        } catch (SQLException e) {
            throw new ImportException("Error retrieving DB source row count", e);
        } catch (IOException e) {
            throw new ImportException("Error retrieving target row count", e);
        } catch (InterruptedException e) {
            throw new ImportException("Error retrieving target row count", e);
        }
    }

    /**
     * Open-ended "setup" routine that is called after the job is configured
     * but just before it is submitted to MapReduce. Subclasses may override
     * if necessary.
     */
    protected void jobSetup(Job job) throws IOException, ImportException {
    }

    /**
     * Open-ended "teardown" routine that is called after the job is executed.
     * Subclasses may override if necessary.
     */
    protected void jobTeardown(Job job) throws IOException, ImportException {
    }

    protected ImportJobContext getContext() {
        return context;
    }
}
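
Example

The protected jobSetup(), completeImport(), and jobTeardown() methods are deliberate no-op hooks, and the constructors accept mapper and format classes for dependency injection. The sketch below shows one way a subclass might use these hooks; the class name and log messages are illustrative only and not part of Sqoop.

import java.io.IOException;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.sqoop.mapreduce.ImportJobBase;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.util.ImportException;

/** Hypothetical subclass demonstrating the ImportJobBase extension hooks. */
public class AuditedImportJob extends ImportJobBase {

    public AuditedImportJob(final SqoopOptions opts) {
        super(opts);
    }

    @Override
    protected void jobSetup(Job job) throws IOException, ImportException {
        // Called after the job is fully configured, just before submission.
        LOG.info("Submitting import to " + FileOutputFormat.getOutputPath(job));
    }

    @Override
    protected void completeImport(Job job) throws IOException, ImportException {
        // Called only after the MapReduce job succeeds, before validation.
        LOG.info("Import finished: " + job.getJobName());
    }
}

In Sqoop itself, concrete import jobs such as org.apache.sqoop.mapreduce.DataDrivenImportJob extend ImportJobBase and supply their mapper, input format, and output format classes through the five-argument constructor.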