org.kiji.mapreduce.pivot.KijiPivotJobBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.kiji.mapreduce.pivot.KijiPivotJobBuilder.java

Source

/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.pivot;

import java.io.IOException;
import java.util.Map;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.map.KijiMultithreadedMapper;
import org.apache.hadoop.util.ReflectionUtils;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.JobConfigurationException;
import org.kiji.mapreduce.KijiMapReduceJob;
import org.kiji.mapreduce.KijiMapper;
import org.kiji.mapreduce.KijiReducer;
import org.kiji.mapreduce.MapReduceJobOutput;
import org.kiji.mapreduce.framework.KijiConfKeys;
import org.kiji.mapreduce.framework.KijiTableInputJobBuilder;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.output.KijiTableMapReduceJobOutput;
import org.kiji.mapreduce.pivot.impl.PivoterMapper;
import org.kiji.mapreduce.reducer.IdentityReducer;
import org.kiji.schema.EntityId;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRowData;

/**
 * Builds jobs that run a {@link KijiPivoter} over a Kiji table.
 *
 * <p>
 *   {@link KijiPivoter} scans the rows from an input KijiTable and writes cells
 *   into an output KijiTable. The input and the output KijiTable may or may not be the same.
 * </p>
 *
 * <p>
 *   Use the {@link KijiPivotJobBuilder} to configure a {@link KijiPivoter} job, by specifying:
 *   <ul>
 *     <li> the {@link KijiPivoter} class to run over the input KijiTable; </li>
 *     <li> the input {@link org.kiji.schema.KijiTable} to be processed by the {@link KijiPivoter};
 *     </li>
 *     <li> the output {@link org.kiji.schema.KijiTable} the {@link KijiPivoter} writes to. </li>
 *   </ul>
 *   Optionally, {@link #withNumThreads(int)} runs several pivoter instances in parallel
 *   within each map task.
 * </p>
 *
 * <p> Example:
 * <pre><blockquote>
 *   final Configuration conf = ...;
 *   final KijiURI inputTableURI = ...;
 *   final KijiURI outputTableURI = ...;
 *   final Path hfilePath = ...;
 *   final KijiMapReduceJob job = KijiPivotJobBuilder.create()
 *       .withConf(conf)
 *       .withPivoter(SomePivoter.class)
 *       .withInputTable(inputTableURI)
 *       .withOutput(MapReduceJobOutputs
 *           .newHFileMapReduceJobOutput(outputTableURI, hfilePath))
 *       .build();
 *   job.run();
 * </blockquote></pre>
 * </p>
 */
@ApiAudience.Public
@ApiStability.Experimental
public final class KijiPivotJobBuilder extends KijiTableInputJobBuilder<KijiPivotJobBuilder> {

    /** Default number of threads per mapper to use for running pivoters. */
    private static final int DEFAULT_NUM_THREADS_PER_MAPPER = 1;

    /** {@link KijiPivoter} class to run over the table. Null until {@link #withPivoter}. */
    private Class<? extends KijiPivoter> mPivoterClass;

    /** Configured number of threads per mapper to use for running pivoters. */
    private int mNumThreadsPerMapper;

    /** Pivoter instance for this KijiMR pivot job. Null until {@link #configureJob(Job)} runs. */
    private KijiPivoter mPivoter;

    /** Hadoop mapper to run for this KijiMR pivot job. Null until {@link #configureJob(Job)} runs. */
    private KijiMapper<?, ?, ?, ?> mMapper;

    /** Hadoop reducer to run for this KijiMR pivot job. Null until {@link #configureJob(Job)} runs. */
    private KijiReducer<?, ?, ?, ?> mReducer;

    /**
     * Specification of the data requested for this pivot job.
     * Null until {@link #configureJob(Job)} runs.
     */
    private KijiDataRequest mDataRequest;

    /**
     * Constructs a builder for jobs that run a {@link KijiPivoter} over a Kiji table.
     *
     * <p> Private: instances are obtained through {@link #create()}. </p>
     */
    private KijiPivotJobBuilder() {
        mPivoterClass = null;
        mNumThreadsPerMapper = DEFAULT_NUM_THREADS_PER_MAPPER;
        mPivoter = null;
        mMapper = null;
        mReducer = null;
        mDataRequest = null;
    }

    /**
     * Creates a new builder for a {@link KijiPivoter} job.
     *
     * @return a new builder for a {@link KijiPivoter} job.
     */
    public static KijiPivotJobBuilder create() {
        return new KijiPivotJobBuilder();
    }

    /**
     * Configures the job with the {@link KijiPivoter} to run.
     *
     * <p> The class is not validated here; a missing (null) pivoter class is reported
     * by {@link #configureJob(Job)}. </p>
     *
     * @param pivoterClass {@link KijiPivoter} class to run over the input Kiji table.
     * @return this builder instance.
     */
    public KijiPivotJobBuilder withPivoter(Class<? extends KijiPivoter> pivoterClass) {
        mPivoterClass = pivoterClass;
        return this;
    }

    /**
     * Configures the output table of this pivoter.
     *
     * @param jobOutput Kiji table the pivoter writes to.
     * @return this builder instance.
     */
    public KijiPivotJobBuilder withOutput(KijiTableMapReduceJobOutput jobOutput) {
        return super.withOutput(jobOutput);
    }

    /**
     * {@inheritDoc}
     *
     * <p> The output of a pivoter must be a KijiTable. </p>
     *
     * @throws RuntimeException if {@code jobOutput} is not a {@link KijiTableMapReduceJobOutput}
     *     (this includes {@code null}).
     */
    @Override
    public KijiPivotJobBuilder withOutput(MapReduceJobOutput jobOutput) {
        if (jobOutput instanceof KijiTableMapReduceJobOutput) {
            return withOutput((KijiTableMapReduceJobOutput) jobOutput);
        }
        // instanceof is false for null, so guard the getClass() call used for the message.
        final String actualType = (null == jobOutput) ? "null" : jobOutput.getClass().getName();
        throw new RuntimeException(
                "KijiPivoter must output to a Kiji table, got job output of type: " + actualType);
    }

    /**
     * Sets the number of threads to use for running the pivoter in parallel.
     *
     * <p>You may use this setting to run multiple instances of the pivoter in parallel
     * within each map task of the job.  This may be useful for increasing throughput when the
     * pivoter is not CPU bound.</p>
     *
     * @param numThreads Number of threads to use per mapper. Must be at least 1.
     * @return this builder instance.
     * @throws IllegalArgumentException if {@code numThreads} is less than 1.
     */
    public KijiPivotJobBuilder withNumThreads(int numThreads) {
        // Guava's Preconditions templates only substitute "%s"; "%d" would be left verbatim.
        Preconditions.checkArgument(numThreads >= 1, "numThreads must be positive, got %s", numThreads);
        mNumThreadsPerMapper = numThreads;
        return this;
    }

    /**
     * {@inheritDoc}
     *
     * <p> Records the pivoter class in the job configuration, selects the mapper and reducer,
     * names the job, instantiates the pivoter to obtain its data request, and finally delegates
     * to the table-input job configuration. </p>
     *
     * @throws JobConfigurationException if no pivoter class was set with {@link #withPivoter}.
     */
    @Override
    protected void configureJob(Job job) throws IOException {
        final Configuration conf = job.getConfiguration();

        if (null == mPivoterClass) {
            throw new JobConfigurationException("Must specify a KijiPivoter class.");
        }

        // Serialize the pivoter class name into the job configuration.
        conf.setClass(KijiConfKeys.KIJI_PIVOTER_CLASS, mPivoterClass, KijiPivoter.class);

        // Pivot jobs always use PivoterMapper as the mapper and IdentityReducer as the reducer.
        mMapper = new PivoterMapper();
        mReducer = new IdentityReducer<Object, Object>();

        job.setJobName("KijiPivoter: " + mPivoterClass.getSimpleName());

        // Instantiate the pivoter locally to learn which data it requests.
        mPivoter = ReflectionUtils.newInstance(mPivoterClass, conf);
        mDataRequest = mPivoter.getDataRequest();

        // Configure the table input job.
        // NOTE(review): presumably the parent consults getDataRequest()/getMapper()/
        // getRequiredStores(), so the fields above must be assigned before this call — confirm.
        super.configureJob(job);
    }

    /**
     * {@inheritDoc}
     *
     * <p> When more than one thread per mapper was requested, the configured mapper is wrapped
     * in a {@link KijiMultithreadedMapper}. </p>
     */
    @Override
    protected void configureMapper(Job job) throws IOException {
        super.configureMapper(job);

        // Configure map-parallelism if requested.
        if (mNumThreadsPerMapper > 1) {
            // Unchecked: mMapper is declared with wildcard type parameters, so the concrete
            // key/value types cannot be verified by the compiler here.
            @SuppressWarnings("unchecked")
            final Class<? extends Mapper<EntityId, KijiRowData, Object, Object>> childMapperClass =
                    (Class<? extends Mapper<EntityId, KijiRowData, Object, Object>>)
                            mMapper.getClass();
            KijiMultithreadedMapper.setMapperClass(job, childMapperClass);
            KijiMultithreadedMapper.setNumberOfThreads(job, mNumThreadsPerMapper);
            job.setMapperClass(KijiMultithreadedMapper.class);
        }
    }

    /**
     * {@inheritDoc}
     *
     * <p> Delegates to the pivoter instance created by {@link #configureJob(Job)}. </p>
     */
    @Override
    protected Map<String, KeyValueStore<?, ?>> getRequiredStores() throws IOException {
        return mPivoter.getRequiredStores();
    }

    /** {@inheritDoc} */
    @Override
    protected KijiMapReduceJob build(Job job) {
        return KijiMapReduceJob.create(job);
    }

    /** {@inheritDoc} */
    @Override
    protected KijiDataRequest getDataRequest() {
        return mDataRequest;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiMapper<?, ?, ?, ?> getMapper() {
        return mMapper;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiReducer<?, ?, ?, ?> getCombiner() {
        // A pivoter cannot have combiners.
        return null;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiReducer<?, ?, ?, ?> getReducer() {
        return mReducer;
    }

    /** {@inheritDoc} */
    @Override
    protected Class<?> getJarClass() {
        return mPivoterClass;
    }
}