Java tutorial: sampling map output keys for Hadoop's TotalOrderPartitioner

The MapOutputSampler utility below is adapted from Hadoop's InputSampler (note the commented-out org.apache.hadoop.mapreduce.lib.partition package declaration). It collects a sample of map output keys and values, runs a small MapReduce job over the sample, and writes the partition file that TotalOrderPartitioner needs, e.g. for a total-order sort or a non-equijoin.
package com.savy3.nonequijoin;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//package org.apache.hadoop.mapreduce.lib.partition;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Utility for collecting samples and writing a partition file for
 * {@link TotalOrderPartitioner}.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class MapOutputSampler<K, V, OK, OV> extends Configured implements Tool {

  private static final Log LOG = LogFactory.getLog(MapOutputSampler.class);

  static int printUsage() {
    System.out.println("sampler -r <reduces>\n"
        + "      [-inFormat <input format class>]\n"
        + "      [-keyClass <map input & output key class>]\n"
        + "      [-splitRandom <double pcnt> <numSamples> <maxsplits> | "
        + "             // Sample from random splits at random (general)\n"
        + "       -splitSample <numSamples> <maxsplits> | "
        + "             // Sample from first records in splits (random data)\n"
        + "       -splitInterval <double pcnt> <maxsplits>]"
        + "             // Sample from splits at intervals (sorted data)");
    System.out.println("Default sampler: -splitRandom 0.1 10000 10");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  public MapOutputSampler(Configuration conf) {
    setConf(conf);
  }

  /**
   * Interface to sample using an
   * {@link org.apache.hadoop.mapreduce.InputFormat}.
   */
  public interface Sampler<K, V> {
    /**
     * For a given job, collect and return a subset of the keys from the
     * input data.
     */
    Map<K, V> getSample(InputFormat<K, V> inf, Job job)
        throws IOException, InterruptedException;
  }

  /**
   * Samples the first n records from s splits. Inexpensive way to sample
   * random data.
   */
  public static class SplitSampler<K, V> implements Sampler<K, V> {

    protected final int numSamples;
    protected final int maxSplitsSampled;

    /**
     * Create a SplitSampler sampling <em>all</em> splits. Takes the first
     * numSamples / numSplits records from each split.
     *
     * @param numSamples
     *          Total number of samples to obtain from all selected splits.
     */
    public SplitSampler(int numSamples) {
      this(numSamples, Integer.MAX_VALUE);
    }

    /**
     * Create a new SplitSampler.
     *
     * @param numSamples
     *          Total number of samples to obtain from all selected splits.
     * @param maxSplitsSampled
     *          The maximum number of splits to examine.
     */
    public SplitSampler(int numSamples, int maxSplitsSampled) {
      this.numSamples = numSamples;
      this.maxSplitsSampled = maxSplitsSampled;
    }

    /**
     * From each split sampled, take the first numSamples / numSplits
     * records.
     */
    @SuppressWarnings("unchecked") // ReflectionUtils.copy is untyped
    public Map<K, V> getSample(InputFormat<K, V> inf, Job job)
        throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      HashMap<K, V> samples = new HashMap<K, V>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());
      int samplesPerSplit = numSamples / splitsToSample;
      long records = 0;
      for (int i = 0; i < splitsToSample; ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
            job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(splits.get(i),
            samplingContext);
        reader.initialize(splits.get(i), samplingContext);
        while (reader.nextKeyValue()) {
          samples.put(
              ReflectionUtils.copy(job.getConfiguration(),
                  reader.getCurrentKey(), null),
              ReflectionUtils.copy(job.getConfiguration(),
                  reader.getCurrentValue(), null));
          ++records;
          if ((i + 1) * samplesPerSplit <= records) {
            break;
          }
        }
        reader.close();
      }
      return samples;
    }
  }

  /**
   * Sample from random points in the input. General-purpose sampler. Takes
   * numSamples / maxSplitsSampled inputs from each split.
   */
  public static class RandomSampler<K, V> implements Sampler<K, V> {

    protected double freq;
    protected final int numSamples;
    protected final int maxSplitsSampled;

    /**
     * Create a new RandomSampler sampling <em>all</em> splits. This will
     * read every split at the client, which is very expensive.
     *
     * @param freq
     *          Probability with which a key will be chosen.
     * @param numSamples
     *          Total number of samples to obtain from all selected splits.
     */
    public RandomSampler(double freq, int numSamples) {
      this(freq, numSamples, Integer.MAX_VALUE);
    }

    /**
     * Create a new RandomSampler.
     *
     * @param freq
     *          Probability with which a key will be chosen.
     * @param numSamples
     *          Total number of samples to obtain from all selected splits.
     * @param maxSplitsSampled
     *          The maximum number of splits to examine.
     */
    public RandomSampler(double freq, int numSamples, int maxSplitsSampled) {
      this.freq = freq;
      this.numSamples = numSamples;
      this.maxSplitsSampled = maxSplitsSampled;
    }

    /**
     * Randomize the split order, then take the specified number of keys
     * from each split sampled, where each key is selected with the
     * specified probability and possibly replaced by a subsequently
     * selected key when the quota of keys from that split is satisfied.
     */
    @SuppressWarnings("unchecked") // ReflectionUtils.copy is untyped
    public Map<K, V> getSample(InputFormat<K, V> inf, Job job)
        throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      HashMap<K, V> samples = new HashMap<K, V>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());

      Random r = new Random();
      long seed = r.nextLong();
      r.setSeed(seed);
      LOG.debug("seed: " + seed);
      // shuffle splits
      for (int i = 0; i < splits.size(); ++i) {
        InputSplit tmp = splits.get(i);
        int j = r.nextInt(splits.size());
        splits.set(i, splits.get(j));
        splits.set(j, tmp);
      }
      // our target rate is in terms of the maximum number of sample splits,
      // but we accept the possibility of sampling additional splits to hit
      // the target sample keyset
      for (int i = 0; i < splitsToSample
          || (i < splits.size() && samples.size() < numSamples); ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
            job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(splits.get(i),
            samplingContext);
        reader.initialize(splits.get(i), samplingContext);
        while (reader.nextKeyValue()) {
          if (r.nextDouble() <= freq) {
            if (samples.size() < numSamples) {
              samples.put(
                  ReflectionUtils.copy(job.getConfiguration(),
                      reader.getCurrentKey(), null),
                  ReflectionUtils.copy(job.getConfiguration(),
                      reader.getCurrentValue(), null));
            } else {
              // When exceeding the maximum number of samples, replace a
              // random element with this one, then adjust the frequency
              // to reflect the possibility of existing elements being
              // pushed out. (The upstream InputSampler keeps samples in an
              // ArrayList and replaces via set(ind, ...); with a HashMap we
              // must evict a random existing key instead, otherwise the map
              // simply keeps growing. Note that r.nextInt(numSamples) is
              // always < numSamples, so the original guard
              // "if (ind != numSamples)" was vacuous.)
              int ind = r.nextInt(samples.size());
              K evict = null;
              for (K k : samples.keySet()) {
                if (ind-- == 0) {
                  evict = k;
                  break;
                }
              }
              samples.remove(evict);
              samples.put(
                  ReflectionUtils.copy(job.getConfiguration(),
                      reader.getCurrentKey(), null),
                  ReflectionUtils.copy(job.getConfiguration(),
                      reader.getCurrentValue(), null));
              freq *= (numSamples - 1) / (double) numSamples;
            }
          }
        }
        reader.close();
      }
      return samples;
    }
  }

  /**
   * Sample from s splits at regular intervals. Useful for sorted data.
   */
  public static class IntervalSampler<K, V> implements Sampler<K, V> {

    protected final double freq;
    protected final int maxSplitsSampled;

    /**
     * Create a new IntervalSampler sampling <em>all</em> splits.
     *
     * @param freq
     *          The frequency with which records will be emitted.
     */
    public IntervalSampler(double freq) {
      this(freq, Integer.MAX_VALUE);
    }

    /**
     * Create a new IntervalSampler.
     *
     * @param freq
     *          The frequency with which records will be emitted.
     * @param maxSplitsSampled
     *          The maximum number of splits to examine.
     * @see #getSample
     */
    public IntervalSampler(double freq, int maxSplitsSampled) {
      this.freq = freq;
      this.maxSplitsSampled = maxSplitsSampled;
    }

    /**
     * For each split sampled, emit when the ratio of the number of records
     * retained to the total record count is less than the specified
     * frequency.
     */
    @SuppressWarnings("unchecked") // ReflectionUtils.copy is untyped
    public Map<K, V> getSample(InputFormat<K, V> inf, Job job)
        throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      HashMap<K, V> samples = new HashMap<K, V>();
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());
      long records = 0;
      long kept = 0;
      for (int i = 0; i < splitsToSample; ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
            job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(splits.get(i),
            samplingContext);
        reader.initialize(splits.get(i), samplingContext);
        while (reader.nextKeyValue()) {
          ++records;
          if ((double) kept / records < freq) {
            samples.put(
                ReflectionUtils.copy(job.getConfiguration(),
                    reader.getCurrentKey(), null),
                ReflectionUtils.copy(job.getConfiguration(),
                    reader.getCurrentValue(), null));
            ++kept;
          }
        }
        reader.close();
      }
      return samples;
    }
  }

  /**
   * Write a partition file for the given job, using the Sampler provided.
   * Queries the sampler for a sample keyset, writes the samples to
   * <partitionfile>/mapIn under the destination returned from
   * {@link TotalOrderPartitioner#getPartitionFile}, then runs a small
   * MapReduce job over the samples (see {@link #runMap}) to select the
   * partition keys.
   */
  @SuppressWarnings("unchecked") // getInputFormatClass, raw InputFormat
  public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
      throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(
        job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    HashMap<K, V> samples = (HashMap<K, V>) sampler.getSample(inf, job);
    LOG.info("Using " + samples.size() + " samples");

    // write the input samples into file <partitionfile>/mapIn
    Path dstOut = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    Path dst = new Path(dstOut, "mapIn");
    FileSystem fs = dst.getFileSystem(conf);
    SequenceFile.Writer sampleWriter = null;
    // Use the first sample's runtime types for the SequenceFile metadata.
    for (Map.Entry<K, V> sample : samples.entrySet()) {
      sampleWriter = SequenceFile.createWriter(fs, conf, dst,
          sample.getKey().getClass(), sample.getValue().getClass());
      break;
    }
    if (sampleWriter == null) {
      // Guard against an empty sample set, which would otherwise NPE below.
      throw new IOException("No samples collected; cannot write " + dst);
    }
    for (Map.Entry<K, V> sample : samples.entrySet()) {
      sampleWriter.append(sample.getKey(), sample.getValue());
    }
    sampleWriter.close();
    LOG.info("Sample Input File location " + dst.toString());
    // run map reduce on the samples input
    runMap(job, dst);
  }

  /**
   * Reducer that writes only the map output key (no value) to the output
   * file, keeping at most numSamples keys.
   */
  public static class SampleKeyReducer<OK, OV>
      extends Reducer<OK, OV, OK, NullWritable> {

    long count = 0;

    @Override
    protected void reduce(OK fileKey, Iterable<OV> values, Context output)
        throws IOException, InterruptedException {
      if (count < Long.parseLong(output.getConfiguration().get("numSamples"))) {
        output.write(fileKey, NullWritable.get());
        count++;
      }
    }
  }

  /**
   * Driver for the sampling MapReduce job.
   */
  public static void runMap(Job job, Path sampleInputPath)
      throws IOException, IllegalStateException, ClassNotFoundException,
      InterruptedException {
    LOG.info("Running a MapReduce Job on Sample Input File "
        + sampleInputPath.toString());
    Configuration conf = new Configuration();
    conf.setBoolean("mapreduce.job.ubertask.enable", true);
    // A partition file for R reduces needs R - 1 split keys.
    conf.set("numSamples", "" + (job.getNumReduceTasks() - 1));
    Job sampleJob = Job.getInstance(conf);
    sampleJob.setMapperClass(job.getMapperClass());
    sampleJob.setReducerClass(SampleKeyReducer.class);
    sampleJob.setJarByClass(job.getMapperClass());
    sampleJob.setMapOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setMapOutputValueClass(job.getMapOutputValueClass());
    sampleJob.setOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setOutputValueClass(NullWritable.class);
    sampleJob.setInputFormatClass(SequenceFileInputFormat.class);
    sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(sampleJob, sampleInputPath);
    FileSystem fs = FileSystem.get(conf);
    Path out = new Path(sampleInputPath.getParent(), "mapOut");
    fs.delete(out, true);
    SequenceFileOutputFormat.setOutputPath(sampleJob, out);
    sampleJob.waitForCompletion(true);
    LOG.info("Sample MapReduce Job Output File " + out.toString());

    // Promote the reducer output to the partition-file location: move
    // part-r-00000 aside, clear the parent directory, then rename the
    // temporary file back into place.
    Path partFile = new Path(out, "part-r-00000");
    Path tmpFile = new Path("/_tmp");
    fs.delete(tmpFile, true);
    fs.rename(partFile, tmpFile);
    fs.delete(sampleInputPath.getParent(), true);
    fs.rename(new Path("/_tmp"), sampleInputPath.getParent());
    LOG.info("Sample partitioning file copied to location "
        + sampleInputPath.getParent().toString());
  }

  /**
   * Driver for InputSampler from the command line. Configures a Job
   * instance and calls {@link #writePartitionFile}.
   */
  public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-r".equals(args[i])) {
          job.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else if ("-inFormat".equals(args[i])) {
          job.setInputFormatClass(
              Class.forName(args[++i]).asSubclass(InputFormat.class));
        } else if ("-keyClass".equals(args[i])) {
          job.setMapOutputKeyClass(
              Class.forName(args[++i]).asSubclass(WritableComparable.class));
        } else if ("-splitSample".equals(args[i])) {
          int numSamples = Integer.parseInt(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) {
            maxSplits = Integer.MAX_VALUE;
          }
          sampler = new SplitSampler<K, V>(numSamples, maxSplits);
        } else if ("-splitRandom".equals(args[i])) {
          System.out.println("Random sampling");
          double pcnt = Double.parseDouble(args[++i]);
          int numSamples = Integer.parseInt(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) {
            maxSplits = Integer.MAX_VALUE;
          }
          sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
        } else if ("-splitInterval".equals(args[i])) {
          double pcnt = Double.parseDouble(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) {
            maxSplits = Integer.MAX_VALUE;
          }
          sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
        } else {
          otherArgs.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from "
            + args[i - 1]);
        return printUsage();
      }
    }
    if (job.getNumReduceTasks() <= 1) {
      System.err.println("Sampler requires more than one reducer");
      return printUsage();
    }
    if (otherArgs.size() < 2) {
      System.out.println("ERROR: Wrong number of parameters: ");
      return printUsage();
    }
    if (null == sampler) {
      sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }

    // The last path argument is the partition file; the rest are inputs.
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
      FileInputFormat.addInputPath(job, new Path(s));
    }
    MapOutputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    MapOutputSampler<?, ?, ?, ?> sampler =
        new MapOutputSampler(new Configuration());
    int res = ToolRunner.run(sampler, args);
    System.exit(res);
  }
}
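
To use the partition file this tool writes, a downstream job points TotalOrderPartitioner at the same path. Per printUsage and run() above, the sampler would first be invoked along the lines of "hadoop jar <your-jar> com.savy3.nonequijoin.MapOutputSampler -r 4 -splitRandom 0.1 10000 10 <input path> <partition path>", with the last argument becoming the partition file. Below is a minimal driver sketch for the consuming job; the class name TotalOrderDriver, the /tmp/partitions and /data/* paths, the reducer count of 4, and the assumption that the input is a SequenceFile of Text keys and values are all illustrative placeholders, not part of the tutorial code above.

package com.savy3.nonequijoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Point the partitioner at the partition file written by
    // MapOutputSampler (the last argument on its command line).
    // Hypothetical path, for illustration only.
    TotalOrderPartitioner.setPartitionFile(conf, new Path("/tmp/partitions"));

    Job job = Job.getInstance(conf, "total-order job");
    job.setJarByClass(TotalOrderDriver.class);
    // The default identity Mapper and Reducer suffice for a plain
    // total-order sort; records pass through and land in sorted,
    // globally ordered reducer outputs.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    // Must match the -r value given to the sampler: the partition file
    // holds R - 1 split keys for R reducers.
    job.setNumReduceTasks(4);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // Assumes the input SequenceFile has Text keys and values.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    SequenceFileInputFormat.addInputPath(job, new Path("/data/input"));
    SequenceFileOutputFormat.setOutputPath(job, new Path("/data/sorted"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Two constraints worth keeping in mind: the keys in the partition file must be the same class as the job's map output keys (TotalOrderPartitioner compares them with the job's sort comparator), and the number of reduce tasks must be exactly one more than the number of split keys in the file, which is why runMap above sets numSamples to getNumReduceTasks() - 1.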