com.asakusafw.runtime.stage.AbstractStageClient.java Source code

Introduction

Here is the source code for com.asakusafw.runtime.stage.AbstractStageClient.java. This abstract client class configures a single Hadoop MapReduce stage for the Asakusa Framework: it resolves the stage inputs, outputs, shuffle settings, and distributed-cache resources, builds a Job, and submits it, optionally through a pluggable JobRunner.

Source

/**
 * Copyright 2011-2017 Asakusa Framework Team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.asakusafw.runtime.stage;

import static com.asakusafw.runtime.stage.StageConstants.*;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ReflectionUtils;

import com.asakusafw.runtime.core.context.RuntimeContext;
import com.asakusafw.runtime.stage.input.StageInputDriver;
import com.asakusafw.runtime.stage.input.StageInputFormat;
import com.asakusafw.runtime.stage.input.StageInputMapper;
import com.asakusafw.runtime.stage.output.LegacyBridgeOutputCommitter;
import com.asakusafw.runtime.stage.output.StageOutputDriver;
import com.asakusafw.runtime.stage.output.StageOutputFormat;
import com.asakusafw.runtime.stage.resource.StageResourceDriver;
import com.asakusafw.runtime.util.VariableTable;
import com.asakusafw.runtime.util.VariableTable.RedefineStrategy;

/**
 * An abstract implementation of a Hadoop MapReduce stage client.
 * @since 0.1.0
 * @version 0.7.1
 */
public abstract class AbstractStageClient extends BaseStageClient {

    /**
     * The method name of {@link #getStageOutputPath()}.
     */
    public static final String METHOD_STAGE_OUTPUT_PATH = "getStageOutputPath"; //$NON-NLS-1$

    /**
     * The method name of {@link #getStageInputs()}.
     */
    public static final String METHOD_STAGE_INPUTS = "getStageInputs"; //$NON-NLS-1$

    /**
     * The method name of {@link #getStageOutputs()}.
     */
    public static final String METHOD_STAGE_OUTPUTS = "getStageOutputs"; //$NON-NLS-1$

    /**
     * The method name of {@link #getStageResources()}.
     */
    public static final String METHOD_STAGE_RESOURCES = "getStageResources"; //$NON-NLS-1$

    /**
     * The method name of {@link #getShuffleKeyClassOrNull()}.
     */
    public static final String METHOD_SHUFFLE_KEY_CLASS = "getShuffleKeyClassOrNull"; //$NON-NLS-1$

    /**
     * The method name of {@link #getShuffleValueClassOrNull()}.
     */
    public static final String METHOD_SHUFFLE_VALUE_CLASS = "getShuffleValueClassOrNull"; //$NON-NLS-1$

    /**
     * The method name of {@link #getPartitionerClassOrNull()}.
     */
    public static final String METHOD_PARTITIONER_CLASS = "getPartitionerClassOrNull"; //$NON-NLS-1$

    /**
     * The method name of {@link #getCombinerClassOrNull()}.
     */
    public static final String METHOD_COMBINER_CLASS = "getCombinerClassOrNull"; //$NON-NLS-1$

    /**
     * The method name of {@link #getSortComparatorClassOrNull()}.
     */
    public static final String METHOD_SORT_COMPARATOR_CLASS = "getSortComparatorClassOrNull"; //$NON-NLS-1$

    /**
     * The method name of {@link #getGroupingComparatorClassOrNull()}.
     */
    public static final String METHOD_GROUPING_COMPARATOR_CLASS = "getGroupingComparatorClassOrNull"; //$NON-NLS-1$

    /**
     * The method name of {@link #getReducerClassOrNull()}.
     */
    public static final String METHOD_REDUCER_CLASS = "getReducerClassOrNull"; //$NON-NLS-1$

    static final Log LOG = LogFactory.getLog(AbstractStageClient.class);

    /**
     * Configures the {@link Job} object for this stage.
     * @param job the target job
     * @param variables current variable table
     * @throws IOException if the job could not be configured
     * @throws InterruptedException if interrupted while configuring the {@link Job} object
     */
    protected void configureStage(Job job, VariableTable variables) throws IOException, InterruptedException {
        ClassLoader loader = job.getConfiguration().getClassLoader();
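        // let extension modules registered via ServiceLoader (META-INF/services) adjust the job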
        for (StageConfigurator configurator : ServiceLoader.load(StageConfigurator.class, loader)) {
            configurator.configure(job);
        }
    }

    /**
     * Returns the stage inputs.
     * @return the stage inputs
     * @see StageInputDriver
     */
    protected abstract List<StageInput> getStageInputs();

    /**
     * Returns the base path of stage output.
     * @return the base path
     */
    protected abstract String getStageOutputPath();

    /**
     * Returns the stage outputs.
     * @return the stage outputs
     * @see StageOutputDriver
     */
    protected List<StageOutput> getStageOutputs() {
        return Collections.emptyList();
    }

    /**
     * Returns the stage resources.
     * @return the stage resources
     * @see StageResourceDriver
     */
    protected List<StageResource> getStageResources() {
        return Collections.emptyList();
    }

    /**
     * Returns the shuffle key class.
     * @return the shuffle key class, or {@code null} if a reduce operation is not required in this stage
     */
    protected Class<? extends Writable> getShuffleKeyClassOrNull() {
        return null;
    }

    /**
     * Returns the shuffle value class.
     * @return the shuffle value class, or {@code null} if a reduce operation is not required in this stage
     */
    protected Class<? extends Writable> getShuffleValueClassOrNull() {
        return null;
    }

    /**
     * Returns the partitioner class.
     * @return the partitioner class, or {@code null} if a reduce operation is not required in this stage
     */
    @SuppressWarnings("rawtypes")
    protected Class<? extends Partitioner> getPartitionerClassOrNull() {
        return null;
    }

    /**
     * Returns the combiner class.
     * @return the combiner class, or {@code null} if a combine operation is not required in this stage
     */
    @SuppressWarnings("rawtypes")
    protected Class<? extends Reducer> getCombinerClassOrNull() {
        return null;
    }

    /**
     * Returns the sort comparator class (for shuffle keys).
     * @return the sort comparator class, or {@code null} if a reduce operation is not required in this stage
     */
    @SuppressWarnings("rawtypes")
    protected Class<? extends RawComparator> getSortComparatorClassOrNull() {
        return null;
    }

    /**
     * Returns the grouping comparator class (for shuffle keys).
     * @return the grouping comparator class, or {@code null} if a reduce operation is not required in this stage
     */
    @SuppressWarnings("rawtypes")
    protected Class<? extends RawComparator> getGroupingComparatorClassOrNull() {
        return null;
    }

    /**
     * Returns the reducer class.
     * @return the reducer class, or {@code null} if a reduce operation is not required in this stage
     */
    @SuppressWarnings("rawtypes")
    protected Class<? extends Reducer> getReducerClassOrNull() {
        return null;
    }

    @Override
    protected int execute(String[] args) throws Exception {
        Configuration conf = getConf();
        conf.set(StageConstants.PROP_BATCH_ID, getBatchId());
        conf.set(StageConstants.PROP_FLOW_ID, getFlowId());
        LOG.info(MessageFormat.format("Initializing Job: batchId={0}, flowId={1}, executionId={2}, stageId={3}",
                getBatchId(), getFlowId(), getExecutionId(), getStageId()));
        Job job = createJob(conf);
        return submit(job);
    }

    /**
     * Creates a new job.
     * @param conf the Asakusa job configuration
     * @return the created job
     * @throws IOException if the job could not be created
     * @throws InterruptedException if interrupted while creating the {@link Job} object
     * @throws IllegalArgumentException if the parameter is {@code null}
     */
    public Job createJob(Configuration conf) throws IOException, InterruptedException {
        if (conf == null) {
            throw new IllegalArgumentException("conf must not be null"); //$NON-NLS-1$
        }
        Job job = Job.getInstance(conf);
        VariableTable variables = getPathParser(job.getConfiguration());
        configureJobInfo(job, variables);
        configureStageInput(job, variables);
        configureStageOutput(job, variables);
        configureShuffle(job, variables);
        configureStageResource(job, variables);
        configureStage(job, variables);
        return job;
    }

    private int submit(Job job) throws IOException, InterruptedException, ClassNotFoundException {
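        // an alternative JobRunner implementation can be injected via the configuration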
        String jobRunnerClassName = job.getConfiguration().get(StageConstants.PROP_JOB_RUNNER);
        JobRunner runner = DefaultJobRunner.INSTANCE;
        if (jobRunnerClassName != null) {
            Class<?> jobRunnerClass = job.getConfiguration().getClassByName(jobRunnerClassName);
            runner = (JobRunner) ReflectionUtils.newInstance(jobRunnerClass, job.getConfiguration());
        }
        LOG.info(MessageFormat.format("Submitting Job: {0} (runner: {1})", job.getJobName(), runner));
        long start = System.currentTimeMillis();
        boolean succeed;
        if (RuntimeContext.get().isSimulation()) {
            LOG.info(MessageFormat.format(
                    "Job is skipped because current execution status is in simulation mode: name={0}",
                    job.getJobName()));
            succeed = true;
        } else {
            succeed = runner.run(job);
        }
        long end = System.currentTimeMillis();
        LOG.info(MessageFormat.format("Job Finished: elapsed=[{3}]ms, succeed={2}, id={0}, name={1}",
                job.getJobID(), job.getJobName(), succeed, String.valueOf(end - start)));

        return succeed ? ToolLauncher.JOB_SUCCEEDED : ToolLauncher.JOB_FAILED;
    }

    private void configureJobInfo(Job job, VariableTable variables) {
        Class<?> clientClass = getClass();
        String operationId = getOperationId();

        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Hadoop Job Client: {0}", clientClass.getName())); //$NON-NLS-1$
        }
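        // prefer the explicitly configured application JAR (requires a JobConf);
        // otherwise infer the JAR from the client class itself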
        String jar = job.getConfiguration().get(PROP_APPLICATION_JAR);
        if (jar == null || !(job.getConfiguration() instanceof JobConf)) {
            job.setJarByClass(clientClass);
        } else {
            ((JobConf) job.getConfiguration()).setJar(jar);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Hadoop Job Name: {0}", operationId)); //$NON-NLS-1$
        }
        job.setJobName(operationId);
    }

    private void configureStageInput(Job job, VariableTable variables) {
        List<StageInput> inputList = new ArrayList<>();
        for (StageInput input : getStageInputs()) {
            Class<? extends Mapper<?, ?, ?, ?>> mapperClass = input.getMapperClass();
            String pathString = input.getPathString();
            Class<? extends InputFormat<?, ?>> formatClass = input.getFormatClass();
            String expanded = variables.parse(pathString);
            Map<String, String> attributes = input.getAttributes();
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("Input: path={0}, format={1}, mapper={2}, attributes={3}", //$NON-NLS-1$
                        expanded, formatClass.getName(), mapperClass.getName(), attributes));
            }
            inputList.add(new StageInput(expanded, formatClass, mapperClass, attributes));
        }
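        // register the expanded inputs; StageInputFormat/StageInputMapper dispatch
        // to each input's own format and mapper classes at runtime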
        StageInputDriver.set(job, inputList);
        job.setInputFormatClass(StageInputFormat.class);
        job.setMapperClass(StageInputMapper.class);
    }

    @SuppressWarnings("rawtypes")
    private void configureShuffle(Job job, VariableTable variables) {
        Class<? extends Reducer> reducer = getReducerClassOrNull();
        if (reducer != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("Reducer: {0}", reducer.getName())); //$NON-NLS-1$
            }
            job.setReducerClass(reducer);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Reducer: N/A"); //$NON-NLS-1$
            }
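            // no reducer: run as a map-only job and skip the rest of the shuffle configuration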
            job.setNumReduceTasks(0);
            return;
        }

        Class<? extends Writable> outputKeyClass = or(getShuffleKeyClassOrNull(), NullWritable.class);
        Class<? extends Writable> outputValueClass = or(getShuffleValueClassOrNull(), NullWritable.class);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Shuffle: key={0}, value={1}", //$NON-NLS-1$
                    outputKeyClass.getName(), outputValueClass.getName()));
        }
        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValueClass);

        Class<? extends Reducer> combiner = getCombinerClassOrNull();
        if (combiner != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("Combiner: {0}", combiner.getName())); //$NON-NLS-1$
            }
            job.setCombinerClass(combiner);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Combiner: N/A"); //$NON-NLS-1$
            }
        }

        Class<? extends Partitioner> partitioner = getPartitionerClassOrNull();
        if (partitioner != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("Partitioner: {0}", partitioner.getName())); //$NON-NLS-1$
            }
            job.setPartitionerClass(partitioner);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Partitioner: DEFAULT"); //$NON-NLS-1$
            }
        }

        Class<? extends RawComparator> groupingComparator = getGroupingComparatorClassOrNull();
        if (groupingComparator != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("GroupingComparator: {0}", groupingComparator.getName())); //$NON-NLS-1$
            }
            job.setGroupingComparatorClass(groupingComparator);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("GroupingComparator: DEFAULT"); //$NON-NLS-1$
            }
        }

        Class<? extends RawComparator> sortComparator = getSortComparatorClassOrNull();
        if (sortComparator != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("SortComparator: {0}", sortComparator.getName())); //$NON-NLS-1$
            }
            job.setSortComparatorClass(sortComparator);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("SortComparator: DEFAULT"); //$NON-NLS-1$
            }
        }
    }

    private void configureStageResource(Job job, VariableTable variables) throws IOException {
        List<StageResource> resources = getStageResources();
        for (StageResource cache : resources) {
            String resolved = variables.parse(cache.getLocation());
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format("Distributed Cache: {0} @ {1}", //$NON-NLS-1$
                        cache.getName(), resolved));
            }
            if (RuntimeContext.get().isSimulation()) {
                LOG.info("Preparing distributed cache is skipped in simulation mode");
            } else {
                StageResourceDriver.add(job, resolved, cache.getName());
            }
        }
    }

    private void configureStageOutput(Job job, VariableTable variables) throws IOException {
        String outputPath = variables.parse(getStageOutputPath());
        List<StageOutput> outputList = new ArrayList<>();
        for (StageOutput output : getStageOutputs()) {
            String name = output.getName();
            Class<?> keyClass = output.getKeyClass();
            Class<?> valueClass = output.getValueClass();
            Class<? extends OutputFormat<?, ?>> formatClass = output.getFormatClass();
            Map<String, String> attributes = output.getAttributes();
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Output: path={0}/{1}-*, format={2}, key={3}, value={4}, attributes={5}", //$NON-NLS-1$
                        outputPath, name, formatClass.getName(), keyClass.getName(), valueClass.getName(),
                        attributes));
            }
            outputList.add(new StageOutput(name, keyClass, valueClass, formatClass, attributes));
        }
        StageOutputDriver.set(job, outputPath, outputList);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormatClass(StageOutputFormat.class);
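        // bridge old-API ("mapred") output committing onto this stage's committer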
        job.getConfiguration().setClass("mapred.output.committer.class", //$NON-NLS-1$
                LegacyBridgeOutputCommitter.class, org.apache.hadoop.mapred.OutputCommitter.class);
    }

    private <T> T or(T a, T b) {
        return a != null ? a : b;
    }

    private VariableTable getPathParser(Configuration configuration) {
        assert configuration != null;
        VariableTable variables = new VariableTable(RedefineStrategy.IGNORE);
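        // the built-in variables are defined first; with RedefineStrategy.IGNORE,
        // later redefinitions of the same names (e.g. from batch arguments) are ignored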
        variables.defineVariable(VAR_USER, getUser());
        variables.defineVariable(VAR_DEFINITION_ID, getDefinitionId());
        variables.defineVariable(VAR_STAGE_ID, getStageId());
        variables.defineVariable(VAR_BATCH_ID, getBatchId());
        variables.defineVariable(VAR_FLOW_ID, getFlowId());
        variables.defineVariable(VAR_EXECUTION_ID, getExecutionId());
        String arguments = configuration.get(PROP_ASAKUSA_BATCH_ARGS);
        if (arguments == null) {
            LOG.warn(MessageFormat.format("Missing configuration \"{0}\" (batch arguments)",
                    PROP_ASAKUSA_BATCH_ARGS));
        } else {
            variables.defineVariables(arguments);
        }

        // write the fully-resolved variable table back into the configuration
        configuration.set(PROP_ASAKUSA_BATCH_ARGS, variables.toSerialString());
        return variables;
    }

    private static final class DefaultJobRunner implements JobRunner {

        static final JobRunner INSTANCE = new DefaultJobRunner();

        @Override
        public boolean run(Job job) throws IOException, InterruptedException, ClassNotFoundException {
            job.submit();
            LOG.info(MessageFormat.format("starting job using {0}: {1} ({2})", this, job.getJobID(),
                    job.getJobName()));
            return job.waitForCompletion(true);
        }

        @Override
        public String toString() {
            return "Hadoop job runner"; //$NON-NLS-1$
        }
    }
}
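
Example

The concrete stage clients that extend this class are normally generated by the Asakusa DSL compiler. For orientation only, here is a minimal hand-written sketch of such a subclass. Every name in it (ExampleStageClient, ExampleMapper, the batch/flow/stage IDs, and the paths) is hypothetical, and it assumes that BaseStageClient declares getBatchId(), getFlowId(), and getStageId() as abstract accessors while supplying the remaining identifiers (execution ID, user, and so on) itself, e.g. from the Hadoop configuration.

// A minimal, hypothetical stage client; real ones are generated by the
// Asakusa DSL compiler. All names below are illustrative only.
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

import com.asakusafw.runtime.stage.AbstractStageClient;
import com.asakusafw.runtime.stage.StageInput;

public class ExampleStageClient extends AbstractStageClient {

    @Override
    protected String getBatchId() {
        return "example.batch"; // hypothetical batch ID
    }

    @Override
    protected String getFlowId() {
        return "example.flow"; // hypothetical flow ID
    }

    @Override
    protected String getStageId() {
        return "main"; // hypothetical stage ID
    }

    @Override
    protected List<StageInput> getStageInputs() {
        // a single input: SequenceFiles matching "example/input/*", processed by
        // ExampleMapper (the 4-argument constructor seen in configureStageInput(...))
        return Collections.singletonList(new StageInput(
                "example/input/*",
                SequenceFileInputFormat.class,
                ExampleMapper.class,
                Collections.<String, String> emptyMap()));
    }

    @Override
    protected String getStageOutputPath() {
        return "example/output";
    }

    // no shuffle-related methods are overridden, so getReducerClassOrNull()
    // returns null and configureShuffle(...) turns this into a map-only job

    /** A do-nothing mapper standing in for generated operator code. */
    public static class ExampleMapper
            extends Mapper<NullWritable, Writable, NullWritable, NullWritable> {
        @Override
        protected void map(NullWritable key, Writable value, Context context) {
            // deliberately empty: a real mapper would transform the record here
        }
    }
}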