com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java Source code


Introduction

Here is the source code for com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java.

Source

/**
 * Copyright 2011-2016 Asakusa Framework Team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.asakusafw.runtime.mapreduce.simple;

import static com.asakusafw.runtime.compatibility.JobCompatibility.*;

import java.io.File;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.JobStatus.State;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.util.Progress;
import org.apache.hadoop.util.ReflectionUtils;

import com.asakusafw.runtime.stage.JobRunner;

/**
 * An implementation of {@link JobRunner} using a simplified map-reduce engine.
 * @since 0.7.1
 */
public class SimpleJobRunner implements JobRunner {

    static final Log LOG = LogFactory.getLog(SimpleJobRunner.class);

    private static final String KEY_PREFIX = "com.asakusafw.mapreduce."; //$NON-NLS-1$

    /**
 * The Hadoop property key of the shuffle buffer size.
     */
    public static final String KEY_BUFFER_SIZE = KEY_PREFIX + "shuffle.buffer"; //$NON-NLS-1$

    /**
 * The Hadoop property key of the shuffle temporary directory.
     */
    public static final String KEY_TEMPORARY_LOCATION = KEY_PREFIX + "shuffle.tempdir"; //$NON-NLS-1$

    /**
 * The Hadoop property key of whether block file compression is enabled.
     */
    public static final String KEY_COMPRESS_BLOCK = KEY_PREFIX + "shuffle.compress"; //$NON-NLS-1$
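
    // Note (editorial, not part of the original source): these keys are ordinary Hadoop
    // configuration entries, so they could be set on the job configuration before running, e.g.:
    //   conf.setLong(SimpleJobRunner.KEY_BUFFER_SIZE, 16L * 1024 * 1024);
    //   conf.set(SimpleJobRunner.KEY_TEMPORARY_LOCATION, "/tmp/asakusa-shuffle");
    //   conf.setBoolean(SimpleJobRunner.KEY_COMPRESS_BLOCK, true);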

    private static final int DEFAULT_BUFFER_SIZE = 64 * 1024 * 1024;

    private static final int MIN_BUFFER_SIZE = 2 * 1024 * 1024;

    private static final int MAX_BUFFER_SIZE = Integer.MAX_VALUE;

    private static final boolean DEFAULT_COMPRESS_BLOCK = false;

    @Override
    public boolean run(Job job) throws InterruptedException {
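        // assign a random job ID and run the job; failures other than interruption are logged
        // and reported by returning false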
        JobID jobId = newJobId(new Random().nextInt(Integer.MAX_VALUE));
        setJobId(job, jobId);
        LOG.info(MessageFormat.format("starting job using {0}: {1} ({2})", this, job.getJobID(), job.getJobName()));
        try {
            runJob(job);
            return true;
        } catch (InterruptedException e) {
            throw e;
        } catch (Exception e) {
            LOG.error(MessageFormat.format("exception was occurred while executing job: {0} ({1})", job.getJobID(),
                    job.getJobName()), e);
            return false;
        }
    }

    private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
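        // run the map phase (and the reduce phase when the job has reduce tasks) inside a single
        // job-level commit: setupJob() first, commitJob() on success, abortJob(State.FAILED) otherwise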
        assert job.getJobID() != null;
        TaskID taskId = newMapTaskId(job.getJobID(), 0);
        Configuration conf = job.getConfiguration();
        OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
        OutputCommitter committer = output
                .getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0)));
        boolean succeed = false;
        committer.setupJob(job);
        try {
            if (job.getNumReduceTasks() == 0) {
                runMap(job, null);
            } else {
                try (KeyValueSorter<?, ?> sorter = createSorter(job, job.getMapOutputKeyClass(),
                        job.getMapOutputValueClass())) {
                    runMap(job, sorter);
                    runReduce(job, sorter);
                }
            }
            committer.commitJob(job);
            succeed = true;
        } finally {
            if (succeed == false) {
                try {
                    committer.abortJob(job, State.FAILED);
                } catch (IOException e) {
                    LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})", job.getJobID(),
                            job.getJobName()), e);
                }
            }
        }
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private void runMap(Job job, KeyValueSorter<?, ?> sorter)
            throws IOException, InterruptedException, ClassNotFoundException {
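        // run one mapper per input split, writing either into the shuffle sorter (map-reduce jobs)
        // or directly through the output format (map-only jobs); each task attempt is committed or
        // aborted via its OutputCommitter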
        Configuration conf = job.getConfiguration();
        InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
        List<InputSplit> splits = input.getSplits(job);
        int serial = 1;
        for (InputSplit split : splits) {
            TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0);
            Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$
                        mapper.getClass().getName(), id, split.getLength()));
            }
            TaskAttemptContext context = newTaskAttemptContext(conf, id);
            // always obtain a new OutputFormat object, because OutputFormat.getOutputCommitter() may cache its result
            OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
            OutputCommitter committer = output.getOutputCommitter(context);
            committer.setupTask(context);
            boolean succeed = false;
            try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) {
                RecordWriter<?, ?> writer;
                if (sorter != null) {
                    writer = new ShuffleWriter(sorter);
                } else {
                    writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
                }
                try {
                    Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split);
                    reader.initialize(split, c);
                    mapper.run(c);
                } finally {
                    writer.close(newTaskAttemptContext(conf, id));
                }
                doCommitTask(context, committer);
                succeed = true;
            } finally {
                if (succeed == false) {
                    doAbortTask(context, committer);
                }
            }
        }
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void runReduce(Job job, KeyValueSorter<?, ?> sorter)
            throws ClassNotFoundException, IOException, InterruptedException {
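        // run a single reducer task, reading the sorted map output back through a ShuffleReader
        // and writing through the job's output format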
        Configuration conf = job.getConfiguration();
        OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
        TaskAttemptID id = newTaskAttemptId(newReduceTaskId(job.getJobID(), 1), 0);
        Reducer<?, ?, ?, ?> reducer = ReflectionUtils.newInstance(job.getReducerClass(), conf);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("starting reducer: {0}@{1} ({2}records, {3}bytes)", //$NON-NLS-1$
                    reducer.getClass().getName(), id, sorter.getRecordCount(), sorter.getSizeInBytes()));
        }
        TaskAttemptContext context = newTaskAttemptContext(conf, id);
        OutputCommitter committer = output.getOutputCommitter(context);
        committer.setupTask(context);
        boolean succeed = false;
        try {
            ShuffleReader reader = new ShuffleReader(sorter, new Progress());
            try {
                RecordWriter<?, ?> writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
                try {
                    Reducer.Context c = newReducerContext(conf, id, reader, sorter.getKeyClass(),
                            sorter.getValueClass(), writer, committer, (RawComparator) job.getGroupingComparator());
                    reducer.run(c);
                } finally {
                    writer.close(newTaskAttemptContext(conf, id));
                }
            } finally {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOG.warn(MessageFormat.format("error occurred while closing reducer input: {0} ({1})", id,
                            job.getJobName()), e);
                }
            }
            doCommitTask(context, committer);
            succeed = true;
        } finally {
            if (succeed == false) {
                doAbortTask(context, committer);
            }
        }
    }

    private void doCommitTask(TaskAttemptContext context, OutputCommitter committer) throws IOException {
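        // commit the task attempt only when the committer reports that a commit is required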
        if (committer.needsTaskCommit(context)) {
            committer.commitTask(context);
        }
    }

    private void doAbortTask(TaskAttemptContext context, OutputCommitter committer) {
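        // abort the task attempt, logging (rather than propagating) any I/O error raised during the abort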
        try {
            committer.abortTask(context);
        } catch (IOException e) {
            LOG.error(MessageFormat.format("error occurred while aborting task: {0} ({1})",
                    context.getTaskAttemptID(), context.getJobName()), e);
        }
    }

    private <K, V> KeyValueSorter<?, ?> createSorter(Job job, Class<K> key, Class<V> value) {
        KeyValueSorter.Options options = getSorterOptions(job.getConfiguration());
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format(
                    "shuffle buffer size: {1}bytes/page, {2}bytes/block, compression:{3} ({0})", //$NON-NLS-1$
                    job.getJobName(), options.getPageSize(), options.getBlockSize(), options.isCompressBlock()));
        }
        return new KeyValueSorter<>(new SerializationFactory(job.getConfiguration()), key, value,
                job.getSortComparator(), options);
    }

    private KeyValueSorter.Options getSorterOptions(Configuration configuration) {
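        // resolve the shuffle options: buffer size (defaulted, then clamped between MIN_BUFFER_SIZE
        // and MAX_BUFFER_SIZE), an optional temporary directory, and block compression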
        long bufferSize = configuration.getLong(KEY_BUFFER_SIZE, -1);
        if (bufferSize < 0) {
            bufferSize = DEFAULT_BUFFER_SIZE;
        } else {
            bufferSize = Math.max(MIN_BUFFER_SIZE, Math.min(MAX_BUFFER_SIZE, bufferSize));
        }
        File temporaryDirectory = null;
        String tempdirString = configuration.get(KEY_TEMPORARY_LOCATION);
        if (tempdirString != null) {
            temporaryDirectory = new File(tempdirString);
            if (temporaryDirectory.mkdirs() == false && temporaryDirectory.isDirectory() == false) {
                LOG.warn(MessageFormat.format("failed to prepare shuffle temporary directory: {0}={1}",
                        KEY_TEMPORARY_LOCATION, temporaryDirectory));
            }
        }
        boolean compress = configuration.getBoolean(KEY_COMPRESS_BLOCK, DEFAULT_COMPRESS_BLOCK);
        KeyValueSorter.Options options = new KeyValueSorter.Options().withBufferSize((int) bufferSize)
                .withTemporaryDirectory(temporaryDirectory).withCompressBlock(compress);
        return options;
    }

    @Override
    public String toString() {
        return "Asakusa built-in job runner";
    }
}
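
Example

The following snippet is a minimal sketch of how this runner might be driven; it is not part of the original source. It assumes only the public pieces shown above (the run(Job) method and the shuffle property keys), and the class name, job name, and configuration values are placeholders for whatever a real job would define.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner;

public class SimpleJobRunnerExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // the shuffle keys declared on SimpleJobRunner (buffer size, temporary directory,
        // block compression) could be set on this configuration here if needed

        Job job = Job.getInstance(conf, "example-job");
        // configure the job's mapper, reducer, input/output formats, and paths here

        boolean succeeded = new SimpleJobRunner().run(job);
        if (!succeeded) {
            System.exit(1);
        }
    }
}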