org.apache.kylin.engine.mr.common.AbstractHadoopJob.java Source code

Introduction

Here is the source code for org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.kylin.engine.mr.common;

/**
 * @author George Song (ysong1)
 *
 */

import static org.apache.hadoop.util.StringUtils.formatTime;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.ClassUtil;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.persistence.RawResource;
import org.apache.kylin.common.persistence.ResourceStore;
import org.apache.kylin.common.util.CliCommandExecutor;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.OptionsHelper;
import org.apache.kylin.common.util.StringSplitter;
import org.apache.kylin.common.util.StringUtil;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.job.JobInstance;
import org.apache.kylin.job.exception.JobException;
import org.apache.kylin.metadata.model.TableDesc;
import org.apache.kylin.metadata.model.TableRef;
import org.apache.kylin.source.SourceFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@SuppressWarnings("static-access")
public abstract class AbstractHadoopJob extends Configured implements Tool {
    private static final Logger logger = LoggerFactory.getLogger(AbstractHadoopJob.class);

    protected static final Option OPTION_JOB_NAME = OptionBuilder.withArgName(BatchConstants.ARG_JOB_NAME).hasArg()
            .isRequired(true).withDescription("Job name. For example, Kylin_Cuboid_Builder-clsfd_v2_Step_22-D)")
            .create(BatchConstants.ARG_JOB_NAME);
    protected static final Option OPTION_CUBE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_CUBE_NAME)
            .hasArg().isRequired(true).withDescription("Cube name. For exmaple, flat_item_cube")
            .create(BatchConstants.ARG_CUBE_NAME);
    protected static final Option OPTION_CUBING_JOB_ID = OptionBuilder.withArgName(BatchConstants.ARG_CUBING_JOB_ID)
            .hasArg().isRequired(false).withDescription("ID of cubing job executable")
            .create(BatchConstants.ARG_CUBING_JOB_ID);
    //    @Deprecated
    protected static final Option OPTION_SEGMENT_NAME = OptionBuilder.withArgName(BatchConstants.ARG_SEGMENT_NAME)
            .hasArg().isRequired(true).withDescription("Cube segment name").create(BatchConstants.ARG_SEGMENT_NAME);
    protected static final Option OPTION_SEGMENT_ID = OptionBuilder.withArgName(BatchConstants.ARG_SEGMENT_ID)
            .hasArg().isRequired(true).withDescription("Cube segment id").create(BatchConstants.ARG_SEGMENT_ID);
    protected static final Option OPTION_INPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_INPUT).hasArg()
            .isRequired(true).withDescription("Input path").create(BatchConstants.ARG_INPUT);
    protected static final Option OPTION_INPUT_FORMAT = OptionBuilder.withArgName(BatchConstants.ARG_INPUT_FORMAT)
            .hasArg().isRequired(false).withDescription("Input format").create(BatchConstants.ARG_INPUT_FORMAT);
    protected static final Option OPTION_OUTPUT_PATH = OptionBuilder.withArgName(BatchConstants.ARG_OUTPUT).hasArg()
            .isRequired(true).withDescription("Output path").create(BatchConstants.ARG_OUTPUT);
    protected static final Option OPTION_NCUBOID_LEVEL = OptionBuilder.withArgName(BatchConstants.ARG_LEVEL)
            .hasArg().isRequired(true).withDescription("N-Cuboid build level, e.g. 1, 2, 3...")
            .create(BatchConstants.ARG_LEVEL);
    protected static final Option OPTION_PARTITION_FILE_PATH = OptionBuilder
            .withArgName(BatchConstants.ARG_PARTITION).hasArg().isRequired(true)
            .withDescription("Partition file path.").create(BatchConstants.ARG_PARTITION);
    protected static final Option OPTION_HTABLE_NAME = OptionBuilder.withArgName(BatchConstants.ARG_HTABLE_NAME)
            .hasArg().isRequired(true).withDescription("HTable name").create(BatchConstants.ARG_HTABLE_NAME);

    protected static final Option OPTION_STATISTICS_ENABLED = OptionBuilder
            .withArgName(BatchConstants.ARG_STATS_ENABLED).hasArg().isRequired(false)
            .withDescription("Statistics enabled").create(BatchConstants.ARG_STATS_ENABLED);
    protected static final Option OPTION_STATISTICS_OUTPUT = OptionBuilder
            .withArgName(BatchConstants.ARG_STATS_OUTPUT).hasArg().isRequired(false)
            .withDescription("Statistics output").create(BatchConstants.ARG_STATS_OUTPUT);
    protected static final Option OPTION_STATISTICS_SAMPLING_PERCENT = OptionBuilder
            .withArgName(BatchConstants.ARG_STATS_SAMPLING_PERCENT).hasArg().isRequired(false)
            .withDescription("Statistics sampling percentage").create(BatchConstants.ARG_STATS_SAMPLING_PERCENT);

    private static final String MAP_REDUCE_CLASSPATH = "mapreduce.application.classpath";

    protected static void runJob(Tool job, String[] args) {
        try {
            int exitCode = ToolRunner.run(job, args);
            System.exit(exitCode);
        } catch (Exception e) {
            e.printStackTrace(System.err);
            System.exit(5);
        }
    }

    // ============================================================================

    protected String name;
    protected boolean isAsync = false;
    protected OptionsHelper optionsHelper = new OptionsHelper();

    protected Job job;

    public AbstractHadoopJob() {
        super(HadoopUtil.getCurrentConfiguration());
    }

    protected void parseOptions(Options options, String[] args) throws ParseException {
        optionsHelper.parseOptions(options, args);
    }

    public void printUsage(Options options) {
        optionsHelper.printUsage(getClass().getSimpleName(), options);
    }

    public Option[] getOptions() {
        return optionsHelper.getOptions();
    }

    public String getOptionsAsString() {
        return optionsHelper.getOptionsAsString();
    }

    protected String getOptionValue(Option option) {
        return optionsHelper.getOptionValue(option);
    }

    protected boolean hasOption(Option option) {
        return optionsHelper.hasOption(option);
    }

    protected int waitForCompletion(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        int retVal = 0;
        long start = System.nanoTime();
        if (isAsync) {
            job.submit();
        } else {
            job.waitForCompletion(true);
            retVal = job.isSuccessful() ? 0 : 1;
            logger.debug("Job '" + job.getJobName() + "' finished "
                    + (job.isSuccessful() ? "successfully in " : "with failures.  Time taken ")
                    + formatTime((System.nanoTime() - start) / 1000000L));
        }
        return retVal;
    }

    protected void setJobClasspath(Job job, KylinConfig kylinConf) {
        String jarPath = kylinConf.getKylinJobJarPath();
        File jarFile = new File(jarPath);
        if (jarFile.exists()) {
            job.setJar(jarPath);
            logger.info("append job jar: " + jarPath);
        } else {
            job.setJarByClass(this.getClass());
        }

        String kylinHiveDependency = System.getProperty("kylin.hive.dependency");
        String kylinHBaseDependency = System.getProperty("kylin.hbase.dependency");
        String kylinKafkaDependency = System.getProperty("kylin.kafka.dependency");
        logger.info("append kylin.hbase.dependency: " + kylinHBaseDependency + " to " + MAP_REDUCE_CLASSPATH);

        Configuration jobConf = job.getConfiguration();
        String classpath = jobConf.get(MAP_REDUCE_CLASSPATH);
        if (classpath == null || classpath.length() == 0) {
            logger.info("Didn't find " + MAP_REDUCE_CLASSPATH
                    + " in job configuration, will run 'mapred classpath' to get the default value.");
            classpath = getDefaultMapRedClasspath();
            logger.info("The default mapred classpath is: " + classpath);
        }

        if (kylinHBaseDependency != null) {
            // yarn classpath is comma separated
            kylinHBaseDependency = kylinHBaseDependency.replace(":", ",");
            classpath = classpath + "," + kylinHBaseDependency;
        }

        jobConf.set(MAP_REDUCE_CLASSPATH, classpath);
        logger.info("Hadoop job classpath is: " + job.getConfiguration().get(MAP_REDUCE_CLASSPATH));

        /*
         *  set extra dependencies as tmpjars & tmpfiles if configured
         */
        StringBuilder kylinDependency = new StringBuilder();

        // for hive dependencies
        if (kylinHiveDependency != null) {
            // yarn classpath is comma separated
            kylinHiveDependency = kylinHiveDependency.replace(":", ",");

            logger.info("Hive Dependencies Before Filtered: " + kylinHiveDependency);
            String filteredHive = filterKylinHiveDependency(kylinHiveDependency, kylinConf);
            logger.info("Hive Dependencies After Filtered: " + filteredHive);

            StringUtil.appendWithSeparator(kylinDependency, filteredHive);
        } else {

            logger.info("No hive dependency jars set in the environment, will find them from classpath:");

            try {
                String hiveExecJarPath = ClassUtil
                        .findContainingJar(Class.forName("org.apache.hadoop.hive.ql.Driver"));

                StringUtil.appendWithSeparator(kylinDependency, hiveExecJarPath);
                logger.info("hive-exec jar file: " + hiveExecJarPath);

                String hiveHCatJarPath = ClassUtil
                        .findContainingJar(Class.forName("org.apache.hive.hcatalog.mapreduce.HCatInputFormat"));
                StringUtil.appendWithSeparator(kylinDependency, hiveHCatJarPath);
                logger.info("hive-catalog jar file: " + hiveHCatJarPath);

                String hiveMetaStoreJarPath = ClassUtil
                        .findContainingJar(Class.forName("org.apache.hadoop.hive.metastore.api.Table"));
                StringUtil.appendWithSeparator(kylinDependency, hiveMetaStoreJarPath);
                logger.info("hive-metastore jar file: " + hiveMetaStoreJarPath);
            } catch (ClassNotFoundException e) {
                logger.error("Cannot found hive dependency jars: " + e);
            }
        }

        // for kafka dependencies
        if (kylinKafkaDependency != null) {
            kylinKafkaDependency = kylinKafkaDependency.replace(":", ",");
            logger.info("Kafka Dependencies: " + kylinKafkaDependency);
            StringUtil.appendWithSeparator(kylinDependency, kylinKafkaDependency);
        } else {
            logger.info("No Kafka dependency jar set in the environment, will find them from classpath:");
            try {
                String kafkaClientJarPath = ClassUtil
                        .findContainingJar(Class.forName("org.apache.kafka.clients.consumer.KafkaConsumer"));
                StringUtil.appendWithSeparator(kylinDependency, kafkaClientJarPath);
                logger.info("kafka jar file: " + kafkaClientJarPath);

            } catch (ClassNotFoundException e) {
                logger.debug("Not found kafka client jar from classpath, it is optional for normal build: " + e);
            }
        }

        // for KylinJobMRLibDir
        String mrLibDir = kylinConf.getKylinJobMRLibDir();
        StringUtil.appendWithSeparator(kylinDependency, mrLibDir);

        setJobTmpJarsAndFiles(job, kylinDependency.toString());

        overrideJobConfig(job.getConfiguration(), kylinConf.getMRConfigOverride());
    }

    private void overrideJobConfig(Configuration jobConf, Map<String, String> override) {
        for (Entry<String, String> entry : override.entrySet()) {
            jobConf.set(entry.getKey(), entry.getValue());
        }
    }

    private String filterKylinHiveDependency(String kylinHiveDependency, KylinConfig config) {
        if (StringUtils.isBlank(kylinHiveDependency))
            return "";

        StringBuilder jarList = new StringBuilder();

        Pattern hivePattern = Pattern.compile(config.getHiveDependencyFilterList());
        Matcher matcher = hivePattern.matcher(kylinHiveDependency);

        while (matcher.find()) {
            if (jarList.length() > 0)
                jarList.append(",");
            jarList.append(matcher.group());
        }

        return jarList.toString();
    }

    private void setJobTmpJarsAndFiles(Job job, String kylinDependency) {
        if (StringUtils.isBlank(kylinDependency))
            return;

        String[] fNameList = kylinDependency.split(",");

        try {
            Configuration jobConf = job.getConfiguration();
            FileSystem localfs = FileSystem.getLocal(jobConf);
            FileSystem hdfs = HadoopUtil.getWorkingFileSystem(jobConf);

            StringBuilder jarList = new StringBuilder();
            StringBuilder fileList = new StringBuilder();

            for (String fileName : fNameList) {
                Path p = new Path(fileName);
                if (p.isAbsolute() == false) {
                    logger.warn("The directory of kylin dependency '" + fileName + "' is not absolute, skip");
                    continue;
                }
                FileSystem fs;
                if (exists(hdfs, p)) {
                    fs = hdfs;
                } else if (exists(localfs, p)) {
                    fs = localfs;
                } else {
                    logger.warn("The directory of kylin dependency '" + fileName + "' does not exist, skip");
                    continue;
                }

                if (fs.getFileStatus(p).isDirectory()) {
                    appendTmpDir(job, fs, p, jarList, fileList);
                    continue;
                }

                StringBuilder list = (p.getName().endsWith(".jar")) ? jarList : fileList;
                if (list.length() > 0)
                    list.append(",");
                list.append(fs.getFileStatus(p).getPath());
            }

            appendTmpFiles(fileList.toString(), jobConf);
            appendTmpJars(jarList.toString(), jobConf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private void appendTmpDir(Job job, FileSystem fs, Path tmpDir, StringBuilder jarList, StringBuilder fileList) {
        try {
            FileStatus[] fList = fs.listStatus(tmpDir);

            for (FileStatus file : fList) {
                Path p = file.getPath();
                if (fs.getFileStatus(p).isDirectory()) {
                    appendTmpDir(job, fs, p, jarList, fileList);
                    continue;
                }

                StringBuilder list = (p.getName().endsWith(".jar")) ? jarList : fileList;
                if (list.length() > 0)
                    list.append(",");
                list.append(fs.getFileStatus(p).getPath().toString());
            }

        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private void appendTmpJars(String jarList, Configuration conf) {
        if (StringUtils.isBlank(jarList))
            return;

        String tmpJars = conf.get("tmpjars", null);
        if (tmpJars == null) {
            tmpJars = jarList;
        } else {
            tmpJars += "," + jarList;
        }
        conf.set("tmpjars", tmpJars);
        logger.info("Job 'tmpjars' updated -- " + tmpJars);
    }

    private void appendTmpFiles(String fileList, Configuration conf) {
        if (StringUtils.isBlank(fileList))
            return;

        String tmpFiles = conf.get("tmpfiles", null);
        if (tmpFiles == null) {
            tmpFiles = fileList;
        } else {
            tmpFiles += "," + fileList;
        }
        conf.set("tmpfiles", tmpFiles);
        logger.info("Job 'tmpfiles' updated -- " + tmpFiles);
    }

    private String getDefaultMapRedClasspath() {

        String classpath = "";
        try {
            CliCommandExecutor executor = KylinConfig.getInstanceFromEnv().getCliCommandExecutor();
            String output = executor.execute("mapred classpath").getSecond();
            classpath = output.trim().replace(':', ',');
        } catch (IOException e) {
            logger.error("Failed to run: 'mapred classpath'.", e);
        }

        return classpath;
    }

    private static boolean exists(FileSystem fs, Path p) throws IOException {
        try {
            return fs.exists(p);
        } catch (IllegalArgumentException ex) {
            // can happen when FS mismatch
            return false;
        }
    }

    public static int addInputDirs(String input, Job job) throws IOException {
        int folderNum = addInputDirs(StringSplitter.split(input, ","), job);
        logger.info("Number of added folders:" + folderNum);
        return folderNum;
    }

    public static int addInputDirs(String[] inputs, Job job) throws IOException {
        int ret = 0;//return number of added folders
        for (String inp : inputs) {
            inp = inp.trim();
            if (inp.endsWith("/*")) {
                inp = inp.substring(0, inp.length() - 2);
                FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
                Path path = new Path(inp);

                if (!exists(fs, path)) {
                    logger.warn("Path not exist:" + path.toString());
                    continue;
                }

                FileStatus[] fileStatuses = fs.listStatus(path);
                boolean hasDir = false;
                for (FileStatus stat : fileStatuses) {
                    if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                        hasDir = true;
                        ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                    }
                }
                if (fileStatuses.length > 0 && !hasDir) {
                    ret += addInputDirs(new String[] { path.toString() }, job);
                }
            } else {
                logger.debug("Add input " + inp);
                FileInputFormat.addInputPath(job, new Path(inp));
                ret++;
            }
        }
        return ret;
    }

    public static KylinConfig loadKylinPropsAndMetadata() throws IOException {
        File metaDir = new File("meta");
        if (!metaDir.getAbsolutePath().equals(System.getProperty(KylinConfig.KYLIN_CONF))) {
            System.setProperty(KylinConfig.KYLIN_CONF, metaDir.getAbsolutePath());
            logger.info("The absolute path for meta dir is " + metaDir.getAbsolutePath());
            KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
            kylinConfig.setMetadataUrl(metaDir.getAbsolutePath());
            return kylinConfig;
        } else {
            return KylinConfig.getInstanceFromEnv();
        }
    }

    protected void attachTableMetadata(TableDesc table, Configuration conf) throws IOException {
        Set<String> dumpList = new LinkedHashSet<>();
        dumpList.add(table.getResourcePath());
        dumpKylinPropsAndMetadata(dumpList, KylinConfig.getInstanceFromEnv(), conf);
    }

    protected void attachCubeMetadata(CubeInstance cube, Configuration conf) throws IOException {
        dumpKylinPropsAndMetadata(collectCubeMetadata(cube), cube.getConfig(), conf);
    }

    protected void attachCubeMetadataWithDict(CubeInstance cube, Configuration conf) throws IOException {
        Set<String> dumpList = new LinkedHashSet<>();
        dumpList.addAll(collectCubeMetadata(cube));
        for (CubeSegment segment : cube.getSegments()) {
            dumpList.addAll(segment.getDictionaryPaths());
        }
        dumpKylinPropsAndMetadata(dumpList, cube.getConfig(), conf);
    }

    protected void attachSegmentMetadataWithDict(CubeSegment segment, Configuration conf) throws IOException {
        Set<String> dumpList = new LinkedHashSet<>();
        dumpList.addAll(collectCubeMetadata(segment.getCubeInstance()));
        dumpList.addAll(segment.getDictionaryPaths());
        dumpKylinPropsAndMetadata(dumpList, segment.getConfig(), conf);
    }

    private Set<String> collectCubeMetadata(CubeInstance cube) {
        // cube, model_desc, cube_desc, table
        Set<String> dumpList = new LinkedHashSet<>();
        dumpList.add(cube.getResourcePath());
        dumpList.add(cube.getDescriptor().getModel().getResourcePath());
        dumpList.add(cube.getDescriptor().getResourcePath());

        for (TableRef tableRef : cube.getDescriptor().getModel().getAllTables()) {
            TableDesc table = tableRef.getTableDesc();
            dumpList.add(table.getResourcePath());
            dumpList.addAll(SourceFactory.getMRDependentResources(table));
        }

        return dumpList;
    }

    protected void dumpKylinPropsAndMetadata(Set<String> dumpList, KylinConfig kylinConfig, Configuration conf)
            throws IOException {
        File tmp = File.createTempFile("kylin_job_meta", "");
        FileUtils.forceDelete(tmp); // we need a directory, so delete the file first

        File metaDir = new File(tmp, "meta");
        metaDir.mkdirs();

        // write kylin.properties
        File kylinPropsFile = new File(metaDir, "kylin.properties");
        kylinConfig.writeProperties(kylinPropsFile);

        // write resources
        dumpResources(kylinConfig, metaDir, dumpList);

        // hadoop distributed cache
        String hdfsMetaDir = OptionsHelper.convertToFileURL(metaDir.getAbsolutePath());
        if (hdfsMetaDir.startsWith("/")) // note Path on windows is like "d:/../..."
            hdfsMetaDir = "file://" + hdfsMetaDir;
        else
            hdfsMetaDir = "file:///" + hdfsMetaDir;
        logger.info("HDFS meta dir is: " + hdfsMetaDir);

        appendTmpFiles(hdfsMetaDir, conf);
    }

    protected void cleanupTempConfFile(Configuration conf) {
        String tempMetaFileString = conf.get("tmpfiles");
        logger.info("tempMetaFileString is : " + tempMetaFileString);
        if (tempMetaFileString != null) {
            if (tempMetaFileString.startsWith("file://")) {
                tempMetaFileString = tempMetaFileString.substring("file://".length());
                File tempMetaFile = new File(tempMetaFileString);
                if (tempMetaFile.exists()) {
                    try {
                        FileUtils.forceDelete(tempMetaFile.getParentFile());

                    } catch (IOException e) {
                        logger.warn("error when deleting " + tempMetaFile, e);
                    }
                } else {
                    logger.info("" + tempMetaFileString + " does not exist");
                }
            } else {
                logger.info("tempMetaFileString is not starting with file:// :" + tempMetaFileString);
            }
        }
    }

    private void dumpResources(KylinConfig kylinConfig, File metaDir, Set<String> dumpList) throws IOException {
        long startTime = System.currentTimeMillis();

        ResourceStore from = ResourceStore.getStore(kylinConfig);
        KylinConfig localConfig = KylinConfig.createInstanceFromUri(metaDir.getAbsolutePath());
        ResourceStore to = ResourceStore.getStore(localConfig);
        for (String path : dumpList) {
            RawResource res = from.getResource(path);
            if (res == null)
                throw new IllegalStateException("No resource found at -- " + path);
            to.putResource(path, res.inputStream, res.timestamp);
            res.inputStream.close();
        }

        logger.debug("Dump resources to {} took {} ms", metaDir, System.currentTimeMillis() - startTime);
    }

    protected void deletePath(Configuration conf, Path path) throws IOException {
        HadoopUtil.deletePath(conf, path);
    }

    public static double getTotalMapInputMB(Job job)
            throws ClassNotFoundException, IOException, InterruptedException, JobException {
        if (job == null) {
            throw new JobException("Job is null");
        }

        long mapInputBytes = 0;
        InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
        for (InputSplit split : input.getSplits(job)) {
            mapInputBytes += split.getLength();
        }
        if (mapInputBytes == 0) {
            throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
        }
        double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
        return totalMapInputMB;
    }

    protected double getTotalMapInputMB()
            throws ClassNotFoundException, IOException, InterruptedException, JobException {
        return getTotalMapInputMB(job);
    }

    protected int getMapInputSplitCount()
            throws ClassNotFoundException, JobException, IOException, InterruptedException {
        if (job == null) {
            throw new JobException("Job is null");
        }
        InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
        return input.getSplits(job).size();
    }

    public void kill() throws JobException {
        if (job != null) {
            try {
                job.killJob();
            } catch (IOException e) {
                throw new JobException(e);
            }
        }
    }

    public Map<String, String> getInfo() throws JobException {
        if (job != null) {
            Map<String, String> status = new HashMap<String, String>();
            if (null != job.getJobID()) {
                status.put(JobInstance.MR_JOB_ID, job.getJobID().toString());
            }
            if (null != job.getTrackingURL()) {
                status.put(JobInstance.YARN_APP_URL, job.getTrackingURL().toString());
            }

            return status;
        } else {
            throw new JobException("Job is null");
        }
    }

    public Counters getCounters() throws JobException {
        if (job != null) {
            try {
                return job.getCounters();
            } catch (IOException e) {
                throw new JobException(e);
            }
        } else {
            throw new JobException("Job is null");
        }
    }

    public void setAsync(boolean isAsync) {
        this.isAsync = isAsync;
    }

    public Job getJob() {
        return this.job;
    }

    // tells MapReduceExecutable to skip this job
    public boolean isSkipped() {
        return false;
    }

}