/**
 * Copyright [2011] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.utils.mapred.crossproduct;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.datasalt.pangool.tuplemr.mapred.lib.output.HadoopOutputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs;
import com.datasalt.pangool.tuplemr.mapred.lib.output.ProxyOutputFormat;
import com.datasalt.pangool.utils.DCUtils;
import com.datasalt.utils.mapred.crossproduct.io.CrossProductExtraKey;
import com.datasalt.utils.mapred.crossproduct.io.CrossProductPair;
import com.datasalt.utils.mapred.joiner.MultiJoinChanneledMapper;
import com.datasalt.utils.mapred.joiner.MultiJoinDatum;
import com.datasalt.utils.mapred.joiner.MultiJoinPair;
import com.datasalt.utils.mapred.joiner.MultiJoinReducer;
import com.datasalt.utils.mapred.joiner.MultiJoiner;

/**
 * The Cross Product Map/Red allows us to perform the cross product of two datasets. For example, if we have set
 * A: { a, b } and set B: { 1, 2, 3 }, the output of this job's A x B cross product would be 6 pairs:
 * { a1, a2, a3, b1, b2, b3 }.
 * <p>
 * The job's Map/Reduce nature allows nesting by performing multiple cross products, depending on the key that we
 * emit in the {@link CrossProductMapper}, which in turn uses the {@link MultiJoiner} API. If we don't always use
 * the same key, we will end up performing as many cross products as there are reduce groups (one per reduce group).
 * <p>
 * The job's output is of type {@link CrossProductPair}. The user is responsible for deserializing the resulting pair.
 * <p>
 * One fundamental caveat: this Job cannot be guaranteed to scale if the <b>RIGHT</b> (second) dataset doesn't fit
 * in memory. In order to scale, an "extra" output is emitted whenever the RIGHT (second) dataset doesn't fit in
 * memory for a certain key. In that case the problem is split into n smaller problems that are grouped by another
 * reducer in a second job - {@link BigGroupsCrossProductJob}. Because most of the groups will fit in memory, the
 * second Map/Reduce job is guaranteed to process very little data and to finish quickly.
 * <p>
 * The maximum number of RIGHT (second) dataset records that may be held in memory per group is configured through
 * SPLIT_DATASET_SIZE_CONF. Its default value is SPLIT_DATASET_SIZE_DEFAULT.
 * <p>
 * Because the job may have two final outputs, the user can join them by calling the method getBigGroupsOutputGlob().
 * Also, the user may control the two-job flow externally by invoking methods like isSecondJobNeeded(), or directly
 * invoke memoryAwareRun() and let the API execute the two jobs in a sequential chain if needed.
 *
 * @author pere
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public class CrossProductMapRed<K, V> {

  private static Logger log = LoggerFactory.getLogger(CrossProductMapRed.class);

  public final static String EXTRA_OUTPUT = "EXTRA";
  public final static String SPLIT_DATASET_SIZE_CONF = "cross.product.split.size";
  public final static Integer SPLIT_DATASET_SIZE_DEFAULT = 100000;

  public final static int FIRST_CHANNEL_IN_REDUCER = 0;
  public final static int SECOND_CHANNEL_IN_REDUCER = 1;

  public static enum Counters {
    TOTALGROUPS, NOTFITTINGINMEMORY, EMITTED, EMITTEDEXTRAOUTPUT
  }

  /**
   * Mapper to extend for emitting datasets to the {@link CrossProductReducer}
   *
   * @author pere
   *
   * @param <K>
   * @param <V>
   * @param <T>
   */
  public static class CrossProductMapper<K, V, T> extends MultiJoinChanneledMapper<K, V, T> {
  }

  /**
   * This Reducer is called automatically by the Cross Product API.
   *
   * @author pere
   */
  public static class CrossProductReducer extends MultiJoinReducer<CrossProductPair, NullWritable> {

    PangoolMultipleOutputs mOs;
    int splitSize;
    List<BytesWritable> inMemoryData;
    CrossProductPair toEmit = new CrossProductPair();
    CrossProductExtraKey extraKey = new CrossProductExtraKey();
    byte[] empty = new byte[0];

    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      mOs = new PangoolMultipleOutputs(context);
      splitSize = context.getConfiguration().getInt(SPLIT_DATASET_SIZE_CONF, SPLIT_DATASET_SIZE_DEFAULT);
      // Size the buffer here: at field-initialization time splitSize would still be 0.
      inMemoryData = new ArrayList<BytesWritable>(splitSize);
      log.info("Cross product split size is [" + splitSize + "]");
    }

    protected void cleanup(Context context) throws IOException, InterruptedException {
      mOs.close();
    }

    protected void reduce(MultiJoinPair key, Iterable<MultiJoinDatum<?>> values, Context ctx) throws IOException,
        InterruptedException {

      ctx.getCounter("stats", Counters.TOTALGROUPS + "").increment(1);
      inMemoryData.clear(); // TODO try to avoid this and reuse buffers
      int nGroups = 0;

      for (MultiJoinDatum<?> datum : values) {
        /*
         * We will receive the second dataset first (assigned to the multijoiner's FIRST_CHANNEL). We want it to
         * fit in memory.
         */
        if (datum.getChannelId() == FIRST_CHANNEL_IN_REDUCER) {
          BytesWritable newWritable = new BytesWritable(); // TODO try to avoid this and reuse buffers
          newWritable.set(datum.getDatum());
          inMemoryData.add(newWritable);
          ctx.getCounter("stats", "CHANNEL_" + FIRST_CHANNEL_IN_REDUCER).increment(1);
          if (inMemoryData.size() == splitSize) {
            /*
             * It doesn't fit in memory. The problem needs to be split: emit to a different output.
             */
            if (nGroups == 0) {
              ctx.getCounter("stats", Counters.NOTFITTINGINMEMORY + "").increment(1);
            }
            for (BytesWritable data : inMemoryData) {
              /*
               * The key for the second job's reducer will have "nGroups" nested in it.
               */
              extraKey.setGroupId(nGroups);
              extraKey.setGroup(key.getMultiJoinGroup());
              toEmit.setLeft(empty);
              toEmit.setRight(data);
              mOs.write(EXTRA_OUTPUT, extraKey, toEmit);
              ctx.getCounter("stats", Counters.EMITTEDEXTRAOUTPUT + "").increment(1);
            }
            inMemoryData.clear();
            nGroups++;
          }
        } else {
          /*
           * Here we are receiving the first dataset (assigned to the multijoiner's SECOND_CHANNEL). If we have put
           * the second dataset all in memory, we will perform the cross product on the fly.
           */
          ctx.getCounter("stats", "CHANNEL_" + SECOND_CHANNEL_IN_REDUCER).increment(1);
          if (nGroups == 0) {
            /*
             * The problem fits in memory
             */
            if (inMemoryData.size() == 0) {
              ctx.getCounter("stats", "UNMATCHED_" + SECOND_CHANNEL_IN_REDUCER).increment(1);
            }
            for (BytesWritable data : inMemoryData) {
              toEmit.setLeft(datum.getDatum());
              toEmit.setRight(data);
              ctx.write(toEmit, NullWritable.get());
              ctx.getCounter("stats", Counters.EMITTED + "").increment(1);
            }
          } else {
            /*
             * We already know the problem doesn't fit in memory
             */
            if (inMemoryData.size() > 0) {
              /*
               * We check this because the split size might be, say, 1000 while 680 records are still pending to be
               * flushed. In that case we need to emit the pending in-memory data from the second dataset.
               */
              for (BytesWritable data : inMemoryData) {
                extraKey.setGroupId(nGroups);
                extraKey.setGroup(key.getMultiJoinGroup());
                toEmit.setLeft(empty);
                toEmit.setRight(data);
                mOs.write(EXTRA_OUTPUT, extraKey, toEmit);
                ctx.getCounter("stats", Counters.EMITTEDEXTRAOUTPUT + "").increment(1);
              }
              inMemoryData.clear();
              // Count the leftover sub-group as well; otherwise it would never receive the replicated
              // first dataset below and its records would be lost.
              nGroups++;
            }
            /*
             * Emit first dataset to "EXTRA" output
             */
            for (int i = 0; i < nGroups; i++) {
              /*
               * We have to replicate the datum as many times as the number of sub-groups created for the next Job.
               */
              extraKey.setGroupId(i);
              extraKey.setGroup(key.getMultiJoinGroup());
              toEmit.setLeft(datum.getDatum());
              toEmit.setRight(empty);
              mOs.write(EXTRA_OUTPUT, extraKey, toEmit);
              ctx.getCounter("stats", Counters.EMITTEDEXTRAOUTPUT + "").increment(1);
            }
          }
        }
      }
    }
  }

  private Job job = null;

  private String name;
  private Class<? extends OutputFormat> outputFormat;
  private Configuration conf;
  private Path leftInputPath, rightInputPath, outputPath;
  private Class<? extends InputFormat> leftInputFormat, rightInputFormat;
  private Class<? extends CrossProductMapper> leftInputMapper, rightInputMapper;
  private Class jarByClass;

  public CrossProductMapRed(String name, Configuration conf) {
    this.name = name;
    this.conf = conf;
  }

  public void setLeftInputPath(Path leftInputPath) {
    this.leftInputPath = leftInputPath;
  }

  public void setLeftInputFormat(Class<? extends InputFormat> leftInputFormat) {
    this.leftInputFormat = leftInputFormat;
  }

  public void setRightInputPath(Path rightInputPath) {
    this.rightInputPath = rightInputPath;
  }

  public void setRightInputFormat(Class<? extends InputFormat> rightInputFormat) {
    this.rightInputFormat = rightInputFormat;
  }

  public void setOutputFormat(Class<? extends OutputFormat> outputFormat) {
    this.outputFormat = outputFormat;
  }

  public void setOutputPath(Path outputPath) {
    this.outputPath = outputPath;
  }

  public void setLeftInputMapper(Class<? extends CrossProductMapper> leftInputMapper) {
    this.leftInputMapper = leftInputMapper;
  }

  public void setRightInputMapper(Class<? extends CrossProductMapper> rightInputMapper) {
    this.rightInputMapper = rightInputMapper;
  }

  public void setJarByClass(Class clazz) {
    this.jarByClass = clazz;
  }

  public Job getJob() throws IOException {
    if (job == null) {
      MultiJoiner multiJoiner = new MultiJoiner(name, conf);
      multiJoiner.setReducer(CrossProductReducer.class);
      multiJoiner.setOutputKeyClass(CrossProductPair.class);
      multiJoiner.setOutputValueClass(NullWritable.class);
      multiJoiner.setOutputFormat(outputFormat);
      multiJoiner.setOutputPath(outputPath);
      multiJoiner.setJarByClass((jarByClass != null) ? jarByClass : leftInputMapper);
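      /*
       * Note the deliberate channel swap below: the RIGHT input is bound to FIRST_CHANNEL_IN_REDUCER so the
       * reducer receives it first and can buffer it in memory, while the LEFT input is bound to
       * SECOND_CHANNEL_IN_REDUCER and is streamed against that buffer.
       */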
      Job job = multiJoiner
          .addChanneledInput(SECOND_CHANNEL_IN_REDUCER, leftInputPath, Object.class, leftInputFormat, leftInputMapper)
          .addChanneledInput(FIRST_CHANNEL_IN_REDUCER, rightInputPath, Object.class, rightInputFormat, rightInputMapper)
          .getJob();
      /*
       * Outputs
       */
      String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
      try {
        DCUtils.serializeToDC(new HadoopOutputFormat(SequenceFileOutputFormat.class), uniqueName, conf);
        job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
        job.setOutputFormatClass(ProxyOutputFormat.class);
        PangoolMultipleOutputs.addNamedOutput(job, EXTRA_OUTPUT, new HadoopOutputFormat(SequenceFileOutputFormat.class),
            CrossProductExtraKey.class, CrossProductPair.class);
      } catch (URISyntaxException e) {
        throw new IOException(e);
      }
      this.job = job;
    }
    return job;
  }

  /**
   * By invoking this method, we delegate to the API the responsibility of running one or two Jobs, depending on
   * the first Job's output.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  public void memoryAwareRun() throws IOException, InterruptedException, ClassNotFoundException {
    getJob().waitForCompletion(true);
    if (isSecondJobNeeded()) {
      getSecondJob().waitForCompletion(true);
    }
  }

  /**
   * By invoking this method after the job has run, we can find out whether a second Job is needed to perform the
   * full cross product. It is the user's responsibility to do something with this information.
   *
   * @throws IOException
   */
  public boolean isSecondJobNeeded() throws IOException {
    return getJob().getCounters().getGroup("stats").findCounter(Counters.NOTFITTINGINMEMORY + "").getValue() > 0;
  }

  /**
   * This Path points to the glob that will be the input to the second Map/Red Job. Useful for unit testing.
   */
  public Path getBigGroupsGlob() {
    return new Path(outputPath, CrossProductMapRed.EXTRA_OUTPUT + "/part-*");
  }

  /**
   * This Path points to the output of the second Map/Red Job, which is an extension of the output provided by the
   * user in the first Job.
   */
  public Path getBigGroupsOutput() {
    return new Path(outputPath + "_big_groups");
  }

  /**
   * This Path points to the glob representing the useful output of the second Job.
   */
  public Path getBigGroupsOutputGlob() {
    return new Path(getBigGroupsOutput() + "/" + "part*");
  }

  /**
   * This method returns the second Job that can be executed after the first Job, if needed.
   *
   * @throws IOException
   */
  public Job getSecondJob() throws IOException {
    return BigGroupsCrossProductJob.get(getBigGroupsGlob(), outputPath.toString(), outputFormat,
        outputPath + "_big_groups", conf);
  }
}
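Usage sketch (illustrative, not part of the library): the driver below wires two inputs into the job and runs the
memory-aware two-job chain. The paths, the job name and the LeftMapper/RightMapper classes are hypothetical; the
mappers are assumed to be user-written subclasses of CrossProductMapRed.CrossProductMapper whose emit logic
(which depends on the MultiJoiner API) is omitted. Only the CrossProductMapRed calls come from the class above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import com.datasalt.utils.mapred.crossproduct.CrossProductMapRed;

public class CrossProductExample {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    CrossProductMapRed crossProduct = new CrossProductMapRed("cross-product-example", conf);

    // LEFT (first) dataset: streamed through the reducer.
    crossProduct.setLeftInputPath(new Path("/data/left"));   // hypothetical path
    crossProduct.setLeftInputFormat(TextInputFormat.class);
    crossProduct.setLeftInputMapper(LeftMapper.class);       // hypothetical CrossProductMapper subclass

    // RIGHT (second) dataset: buffered in memory per group; groups bigger than
    // cross.product.split.size are deferred to the second job.
    crossProduct.setRightInputPath(new Path("/data/right")); // hypothetical path
    crossProduct.setRightInputFormat(TextInputFormat.class);
    crossProduct.setRightInputMapper(RightMapper.class);     // hypothetical CrossProductMapper subclass

    crossProduct.setOutputPath(new Path("/data/cross"));     // hypothetical path
    crossProduct.setOutputFormat(SequenceFileOutputFormat.class);

    // Runs one or two jobs depending on whether any group overflowed the split size.
    crossProduct.memoryAwareRun();

    // Alternative, manual control of the two-job flow:
    // crossProduct.getJob().waitForCompletion(true);
    // if (crossProduct.isSecondJobNeeded()) {
    //   crossProduct.getSecondJob().waitForCompletion(true);
    // }
  }
}

When the second job runs, the complete cross product is the user output of the first job plus the files matched by
getBigGroupsOutputGlob().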