org.kiji.scoring.batch.ScoreFunctionJobBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.kiji.scoring.batch.ScoreFunctionJobBuilder.java

Source

/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kiji.scoring.batch;

import java.io.IOException;
import java.util.Map;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.SerializationUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.map.KijiMultithreadedMapper;
import org.apache.hadoop.util.ReflectionUtils;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.JobConfigurationException;
import org.kiji.mapreduce.KijiMapReduceJob;
import org.kiji.mapreduce.KijiMapper;
import org.kiji.mapreduce.KijiReducer;
import org.kiji.mapreduce.MapReduceJobOutput;
import org.kiji.mapreduce.framework.KijiTableInputJobBuilder;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreReaderFactory;
import org.kiji.mapreduce.output.KijiTableMapReduceJobOutput;
import org.kiji.mapreduce.reducer.IdentityReducer;
import org.kiji.schema.EntityId;
import org.kiji.schema.KijiColumnName;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRowData;
import org.kiji.scoring.FreshenerContext;
import org.kiji.scoring.ScoreFunction;
import org.kiji.scoring.batch.impl.ScoreFunctionMapper;
import org.kiji.scoring.impl.InternalFreshenerContext;

/**
 * Builder for KijiMapReduceJobs which run ScoreFunction implementations across all rows of a table.
 *
 * A ScoreFunction MapReduce job runs a ScoreFunction against all rows within the specified range
 * of a table. It runs the ScoreFunction as if it was attached with an
 * {@link org.kiji.scoring.lib.AlwaysFreshen} policy which provides no additional parameters or
 * KeyValueStores.
 *
 * ScoreFunction MapReduce jobs require that all information available to a ScoreFunction via the
 * FreshenerContext be specified during construction of the job. This information includes:
 * <ul>
 *   <li>attached column (normally this would be the column where the Freshener is attached)</li>
 *   <li>string-string parameter mapping (defaults to an empty map)</li>
 *   <li>
 *     client data request (normally this would be the request which triggered the run of the
 *     Freshener) (defaults to an empty data request)
 *   </li>
 *   <li>
 *     KeyValueStores will be constructed from the return value of the ScoreFunction's
 *     getRequiredStores method optionally overridden by KeyValueStores specified to
 *     {@link #withKeyValueStoreOverrides(java.util.Map)}. This optional overriding makes up for the
 *     lack of overrides normally provided by the KijiFreshnessPolicy. (defaults to an empty map)
 *   </li>
 * </ul>
 *
 * <p>
 *   Example usage:
 *   <pre>
 *     final KijiMapReduceJob sfJob = ScoreFunctionJobBuilder.create()
 *         .withConf(conf)
 *         .withInputTable(inputTableURI)
 *         .withAttachedColumn(new KijiColumnName("family:qualifier"))
 *         .withScoreFunctionClass(MyScoreFunction.class)
 *         .withOutput(MapReduceJobOutputs.newDirectKijiTableMapReduceJobOutput(inputTableURI))
 *         .build();
 *     sfJob.run();
 *   </pre>
 * </p>
 */
@ApiAudience.Public
@ApiStability.Experimental
public final class ScoreFunctionJobBuilder extends KijiTableInputJobBuilder<ScoreFunctionJobBuilder> {

    public static final String SCORE_FUNCTION_CLASS_CONF_KEY = "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_class_conf_key";
    public static final String SCORE_FUNCTION_PARAMETERS_CONF_KEY = "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_parameters_conf_key";
    public static final String SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY = "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_attached_column_conf_key";
    public static final String SCORE_FUNCTION_CLIENT_DATA_REQUEST_CONF_KEY = "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_client_data_request_conf_key";

    private static final Gson GSON = new Gson();
    private static final KijiDataRequest DEFAULT_CLIENT_REQUEST = KijiDataRequest.builder().build();
    private static final Map<String, String> DEFAULT_PARAMETERS = Maps.newHashMap();
    private static final int DEFAULT_NUM_THREADS_PER_MAPPER = 1;

    private Class<? extends ScoreFunction<?>> mScoreFunctionClass = null;
    private KijiTableMapReduceJobOutput mJobOutput = null;
    private ScoreFunction<?> mScoreFunction = null;
    private KijiMapper<?, ?, ?, ?> mMapper = null;
    private KijiReducer<?, ?, ?, ?> mReducer = null;
    private KijiDataRequest mScoreFunctionDataRequest = null;
    private int mNumThreadsPerMapper = DEFAULT_NUM_THREADS_PER_MAPPER;
    private KijiDataRequest mClientDataRequest = null;
    private KijiColumnName mAttachedColumn = null;
    private Map<String, String> mParameters = null;
    private Map<String, KeyValueStore<?, ?>> mKeyValueStoreOverrides = null;

    /** Private constructor. Use {@link #create()}. */
    private ScoreFunctionJobBuilder() {
    }

    /**
     * Create a new ScoreFunctionJobBuilder.
     *
     * @return a new ScoreFunctionJobBuilder.
     */
    public static ScoreFunctionJobBuilder create() {
        return new ScoreFunctionJobBuilder();
    }

    /**
     * Configure the Job to run the given ScoreFunction implementation to generate scores.
     *
     * @param scoreFunctionClass class of the ScoreFunction implementation with which to generate
     *     scores.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withScoreFunctionClass(
            final Class<? extends ScoreFunction<?>> scoreFunctionClass) {
        mScoreFunctionClass = scoreFunctionClass;
        return this;
    }

    /**
     * Configure the Job to output using the given KijiTableMapReduceJobOutput. The output table must
     * match the input table.
     *
     * @param jobOutput KijiTableMapReduceJobOutput which defines the output from this mapreduce job.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withOutput(final KijiTableMapReduceJobOutput jobOutput) {
        mJobOutput = jobOutput;
        return super.withOutput(jobOutput);
    }

    /** {@inheritDoc} */
    @Override
    public ScoreFunctionJobBuilder withOutput(final MapReduceJobOutput jobOutput) {
        if (jobOutput instanceof KijiTableMapReduceJobOutput) {
            return withOutput((KijiTableMapReduceJobOutput) jobOutput);
        } else {
            throw new RuntimeException("jobOutput parameter of ScoreFunctionJobBuilder.withOutput() must "
                    + "be a KijiTableMapReduceJobOutput.");
        }
    }

    /**
     * Sets the number of threads to use for running the ScoreFunction in parallel.
     *
     * <p>
     *   You may use this setting to run multiple instances of your ScoreFunction in parallel within
     *   each map task of the job. This may be useful for increasing your throughput when your
     *   ScoreFunction is not CPU bound.
     * </p>
     *
     * @param numThreads the number of ScoreFunctions which will be run in parallel per mapper.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withNumThreadsPerMapper(final int numThreads) {
        Preconditions.checkArgument(0 < numThreads, "numThreads must be positive, got %d", numThreads);
        mNumThreadsPerMapper = numThreads;
        return this;
    }

    /**
     * Configure the Job to include the given client data request. This request will be visible to the
     * ScoreFunction via {@link org.kiji.scoring.FreshenerContext#getClientRequest()}. If unspecified,
     * an empty data request will be used.
     *
     * @param clientDataRequest KijiDataRequest which will be visible to the ScoreFunction.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withClientDataRequest(final KijiDataRequest clientDataRequest) {
        mClientDataRequest = clientDataRequest;
        return this;
    }

    /**
     * Configure the Job to include the given attached column. This column will be visible to the
     * ScoreFunction via {@link org.kiji.scoring.FreshenerContext#getAttachedColumn()} and will be
     * used as the output column for values written by the ScoreFunction. The schema of this column
     * should be compatible with the schema of values output by the ScoreFunction.
     *
     * @param attachedColumn column to which to write ScoreFunction return values.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withAttachedColumn(final KijiColumnName attachedColumn) {
        mAttachedColumn = attachedColumn;
        return this;
    }

    /**
     * Configure the Job to include the given parameters. These parameters should be the equivalent of
     * merging request and attachment time parameters from the real time execution of a Freshener.
     *
     * @param parameters parameters which will be available to the ScoreFunction via the
     *     FreshenerContext.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withParameters(final Map<String, String> parameters) {
        mParameters = parameters;
        return this;
    }

    /**
     * Configures the Job to use the given KeyValueStores in preference to those requested by the
     * ScoreFunction when there are name conflicts. This mirrors the KeyValueStore override behavior
     * provided by a KijiFreshnessPolicy. These KeyValueStores will only replace KeyValueStores
     * requested by the ScoreFunction with the same name. KeyValueStores requested by the
     * ScoreFunction whose names are not shadowed in this map will be available as normal.
     *
     * @param kvStoreOverrides KeyValueStores which will take precedence over stores requested by the
     *     ScoreFunction.
     * @return this builder.
     */
    public ScoreFunctionJobBuilder withKeyValueStoreOverrides(
            final Map<String, KeyValueStore<?, ?>> kvStoreOverrides) {
        mKeyValueStoreOverrides = kvStoreOverrides;
        return this;
    }

    /** {@inheritDoc} */
    @Override
    protected void configureJob(final Job job) throws IOException {
        if (null == mScoreFunctionClass) {
            throw new JobConfigurationException("Must specify a ScoreFunction class.");
        }
        if (null == mClientDataRequest) {
            mClientDataRequest = DEFAULT_CLIENT_REQUEST;
        }
        if (null == mAttachedColumn) {
            throw new JobConfigurationException("Must specified an AttachedColumn.");
        }
        if (null == mParameters) {
            mParameters = DEFAULT_PARAMETERS;
        }

        final Configuration conf = job.getConfiguration();
        conf.setClass(SCORE_FUNCTION_CLASS_CONF_KEY, mScoreFunctionClass, ScoreFunction.class);
        if (!getInputTableURI().equals(mJobOutput.getOutputTableURI())) {
            throw new JobConfigurationException(
                    String.format("Output table must be the same as the input" + "table. Got input: %s output: %s",
                            getInputTableURI(), mJobOutput.getOutputTableURI()));
        }
        conf.set(SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY, mAttachedColumn.getName());
        conf.set(SCORE_FUNCTION_PARAMETERS_CONF_KEY, GSON.toJson(mParameters, Map.class));
        conf.set(SCORE_FUNCTION_CLIENT_DATA_REQUEST_CONF_KEY,
                Base64.encodeBase64String(SerializationUtils.serialize(mClientDataRequest)));
        mMapper = new ScoreFunctionMapper();
        mReducer = new IdentityReducer<Object, Object>();
        job.setJobName("Kiji ScoreFunction: " + mScoreFunctionClass.getSimpleName());
        mScoreFunction = ReflectionUtils.newInstance(mScoreFunctionClass, conf);
        final FreshenerContext context = InternalFreshenerContext.create(mClientDataRequest, mAttachedColumn,
                mParameters, Maps.<String, String>newHashMap(),
                KeyValueStoreReaderFactory.create(getRequiredStores()));
        mScoreFunctionDataRequest = mScoreFunction.getDataRequest(context);

        super.configureJob(job);
    }

    /** {@inheritDoc} */
    @Override
    protected void configureMapper(final Job job) throws IOException {
        super.configureMapper(job);

        if (mNumThreadsPerMapper > 1) {
            @SuppressWarnings("unchecked")
            Class<? extends Mapper<EntityId, KijiRowData, Object, Object>> childMapperClass = (Class<? extends Mapper<EntityId, KijiRowData, Object, Object>>) mMapper
                    .getClass();
            KijiMultithreadedMapper.setMapperClass(job, childMapperClass);
            KijiMultithreadedMapper.setNumberOfThreads(job, mNumThreadsPerMapper);
            job.setMapperClass(KijiMultithreadedMapper.class);
        }
    }

    /** {@inheritDoc} */
    @Override
    protected Map<String, KeyValueStore<?, ?>> getRequiredStores() {
        final FreshenerContext context = InternalFreshenerContext.create(mAttachedColumn, mParameters);
        final Map<String, KeyValueStore<?, ?>> combinedStores = Maps.newHashMap();
        combinedStores.putAll(mScoreFunction.getRequiredStores(context));
        if (null != mKeyValueStoreOverrides) {
            combinedStores.putAll(mKeyValueStoreOverrides);
        }
        return combinedStores;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiDataRequest getDataRequest() {
        return mScoreFunctionDataRequest;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiMapReduceJob build(final Job job) {
        return KijiMapReduceJob.create(job);
    }

    /** {@inheritDoc} */
    @Override
    protected KijiMapper<?, ?, ?, ?> getMapper() {
        return mMapper;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiReducer<?, ?, ?, ?> getCombiner() {
        return null;
    }

    /** {@inheritDoc} */
    @Override
    protected KijiReducer<?, ?, ?, ?> getReducer() {
        return mReducer;
    }

    /** {@inheritDoc} */
    @Override
    protected Class<?> getJarClass() {
        return mScoreFunctionClass;
    }
}