org.kiji.mapreduce.produce.KijiProduceJobBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.kiji.mapreduce.produce.KijiProduceJobBuilder.java

Source

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.produce;

import java.io.IOException;
import java.util.Map;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.map.KijiMultithreadedMapper;
import org.apache.hadoop.util.ReflectionUtils;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.JobConfigurationException;
import org.kiji.mapreduce.KijiMapReduceJob;
import org.kiji.mapreduce.KijiMapper;
import org.kiji.mapreduce.KijiReducer;
import org.kiji.mapreduce.MapReduceJobOutput;
import org.kiji.mapreduce.framework.KijiConfKeys;
import org.kiji.mapreduce.framework.KijiTableInputJobBuilder;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.output.KijiTableMapReduceJobOutput;
import org.kiji.mapreduce.produce.impl.KijiProducers;
import org.kiji.mapreduce.produce.impl.ProduceMapper;
import org.kiji.mapreduce.reducer.IdentityReducer;
import org.kiji.schema.EntityId;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRowData;
import org.kiji.schema.KijiTable;

/**
 * Builds jobs that run a producer over a Kiji table.
 *
 * <p>A produce job reads rows from a Kiji table and writes the producer output back to that
 * same table: the configured output table must be identical to the input table. By default,
 * each map task runs a single producer thread; see {@link #withNumThreads(int)}.</p>
 */
@ApiAudience.Public
@ApiStability.Evolving
public final class KijiProduceJobBuilder extends KijiTableInputJobBuilder<KijiProduceJobBuilder> {

    /** The default number of threads per mapper to use for running producers. */
    private static final int DEFAULT_NUM_THREADS_PER_MAPPER = 1;

    /** The class of the producer to run. */
    private Class<? extends KijiProducer> mProducerClass;

    /** The number of threads per mapper to use for running producers. */
    private int mNumThreadsPerMapper;

    /** Producer job output. */
    private KijiTableMapReduceJobOutput mJobOutput;

    /** The producer instance (created while configuring the job). */
    private KijiProducer mProducer;

    /** The mapper instance (created while configuring the job). */
    private KijiMapper<?, ?, ?, ?> mMapper;

    /** The reducer instance (created while configuring the job). */
    private KijiReducer<?, ?, ?, ?> mReducer;

    /** The data request for the job's table input, as reported by the producer instance. */
    private KijiDataRequest mDataRequest;

    /** Constructs a builder for jobs that run a Kiji producer over a Kiji table. */
    private KijiProduceJobBuilder() {
        mProducerClass = null;
        mNumThreadsPerMapper = DEFAULT_NUM_THREADS_PER_MAPPER;
        mJobOutput = null;
        mProducer = null;
        mMapper = null;
        mReducer = null;
        mDataRequest = null;
    }

    /**
     * Creates a new builder for Kiji produce jobs.
     *
     * @return a new Kiji produce job builder.
     */
    public static KijiProduceJobBuilder create() {
        return new KijiProduceJobBuilder();
    }

    /**
     * Configures the job with the Kiji producer to run.
     *
     * @param producerClass The producer class.
     * @return This builder instance so you may chain configuration method calls.
     */
    public KijiProduceJobBuilder withProducer(Class<? extends KijiProducer> producerClass) {
        mProducerClass = producerClass;
        return this;
    }

    /**
     * Configures the producer output.
     *
     * @param jobOutput Output table of the producer must match the input table.
     * @return This builder instance so you may chain configuration method calls.
     */
    public KijiProduceJobBuilder withOutput(KijiTableMapReduceJobOutput jobOutput) {
        // Keep a typed reference so the output table URI can be checked when configuring the job.
        mJobOutput = jobOutput;
        return super.withOutput(jobOutput);
    }

    /**
     * {@inheritDoc}
     *
     * @param jobOutput Output table of the producer must match the input table. Must be an instance
     *     of KijiTableMapReduceJobOutput or a subclass.
     */
    @Override
    public KijiProduceJobBuilder withOutput(MapReduceJobOutput jobOutput) {
        if (jobOutput instanceof KijiTableMapReduceJobOutput) {
            return withOutput((KijiTableMapReduceJobOutput) jobOutput);
        } else {
            // Throw a more helpful debugging message.
            throw new RuntimeException("jobOutput parameter of KijiProduceJobBuilder.withOutput() must "
                    + "be a KijiTableMapReduceJobOutput.");
        }
    }

    /**
     * Sets the number of threads to use for running the producer in parallel.
     *
     * <p>You may use this setting to run multiple instances of your producer in parallel
     * within each map task of the job.  This may be useful for increasing throughput when your
     * producer is not CPU bound.</p>
     *
     * @param numThreads The number of produce-runner threads to use per mapper. Must be at least 1.
     * @return This builder instance so you may chain configuration method calls.
     * @throws IllegalArgumentException if {@code numThreads} is less than 1.
     */
    public KijiProduceJobBuilder withNumThreads(int numThreads) {
        // Guava's Preconditions only substitutes "%s" placeholders; "%d" would be left verbatim.
        Preconditions.checkArgument(numThreads >= 1, "numThreads must be positive, got %s", numThreads);
        mNumThreadsPerMapper = numThreads;
        return this;
    }

    /**
     * {@inheritDoc}
     *
     * <p>Checks that a producer and an output have been specified and that the output table is
     * the input table, records the producer class in the job configuration, instantiates the
     * producer to obtain its data request, and then delegates to the table-input configuration
     * of the parent builder.</p>
     *
     * @throws JobConfigurationException if no producer or no output was specified, or if the
     *     output table differs from the input table.
     */
    @Override
    protected void configureJob(Job job) throws IOException {
        final Configuration conf = job.getConfiguration();

        // Validate the builder state up front, before modifying the job configuration.
        if (null == mProducerClass) {
            throw new JobConfigurationException("Must specify a producer.");
        }
        if (null == mJobOutput) {
            // Fail with a configuration error rather than a NullPointerException below.
            throw new JobConfigurationException("Must specify job output.");
        }

        // Serialize the producer class name into the job configuration.
        conf.setClass(KijiConfKeys.KIJI_PRODUCER_CLASS, mProducerClass, KijiProducer.class);

        // Write to the table, but make sure the output table is the same as the input table.
        if (!getInputTableURI().equals(mJobOutput.getOutputTableURI())) {
            throw new JobConfigurationException("Output table must be the same as the input table.");
        }

        // Produce jobs always use the produce mapper paired with an identity reducer.
        mMapper = new ProduceMapper();
        mReducer = new IdentityReducer<Object, Object>();

        job.setJobName("Kiji produce: " + mProducerClass.getSimpleName());

        // Construct the producer instance; its data request drives the table input below.
        mProducer = ReflectionUtils.newInstance(mProducerClass, conf);
        mDataRequest = mProducer.getDataRequest();

        // Configure the table input job.
        super.configureJob(job);
    }

    /**
     * {@inheritDoc}
     *
     * <p>When more than one thread per mapper was requested, the job's mapper class is replaced
     * by {@link KijiMultithreadedMapper}, configured to run the regular mapper class with the
     * requested number of threads.</p>
     */
    @Override
    protected void configureMapper(Job job) throws IOException {
        super.configureMapper(job);

        // Configure map-parallelism if configured.
        if (mNumThreadsPerMapper > 1) {
            // mMapper is the ProduceMapper created in configureJob(); the cast is unchecked
            // because getClass() only exposes the wildcard mapper type.
            @SuppressWarnings("unchecked")
            final Class<? extends Mapper<EntityId, KijiRowData, Object, Object>> childMapperClass =
                    (Class<? extends Mapper<EntityId, KijiRowData, Object, Object>>) mMapper.getClass();
            KijiMultithreadedMapper.setMapperClass(job, childMapperClass);
            KijiMultithreadedMapper.setNumberOfThreads(job, mNumThreadsPerMapper);
            job.setMapperClass(KijiMultithreadedMapper.class);
        }
    }

    /** {@inheritDoc} */
    @Override
    protected Map<String, KeyValueStore<?, ?>> getRequiredStores() throws IOException {
        // The stores are those declared by the producer instance created in configureJob().
        return mProducer.getRequiredStores();
    }

    /** {@inheritDoc} */
    @Override
    protected KijiMapReduceJob build(Job job) {
        return KijiMapReduceJob.create(job);
    }

    /** {@inheritDoc} */
    @Override
    protected KijiDataRequest getDataRequest() {
        return mDataRequest;
    }

    /** {@inheritDoc} */
    @Override
    protected void validateInputTable(KijiTable inputTable) throws IOException {
        // Validate the Kiji data request against the input table layout:
        super.validateInputTable(inputTable);

        // Validate the producer output column against the output table (ie. the input table):
        KijiProducers.validateOutputColumn(mProducer, inputTable.getLayout());
    }

    /** {@inheritDoc} */
    @Override
    protected KijiMapper<?, ?, ?, ?> getMapper() {
        return mMapper;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiReducer<?, ?, ?, ?> getCombiner() {
        // Producers can't have combiners.
        return null;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiReducer<?, ?, ?, ?> getReducer() {
        return mReducer;
    }

    /** {@inheritDoc} */
    @Override
    protected Class<?> getJarClass() {
        return mProducerClass;
    }
}