Java tutorial
package com.ebay.erl.mobius.core.mapred; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Random; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.lib.MobiusDelegatingInputFormat; import org.apache.hadoop.mapred.lib.InputSampler.Sampler; import org.apache.hadoop.util.ReflectionUtils; import com.ebay.erl.mobius.core.ConfigureConstants; import com.ebay.erl.mobius.core.datajoin.DataJoinKey; import com.ebay.erl.mobius.core.model.Column; import com.ebay.erl.mobius.core.model.Tuple; import com.ebay.erl.mobius.core.sort.Sorter; import com.ebay.erl.mobius.core.sort.Sorter.Ordering; import com.ebay.erl.mobius.util.SerializableUtil; import com.ebay.erl.mobius.util.Util; /** * Performing sampling for total sort job. * * <p> * This product is licensed under the Apache License, Version 2.0, * available at http://www.apache.org/licenses/LICENSE-2.0. * * This product contains portions derived from Apache hadoop which is * licensed under the Apache License, Version 2.0, available at * http://hadoop.apache.org. * * 2007 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan */ @SuppressWarnings({ "deprecation", "unchecked" }) public class MobiusInputSampler implements Sampler { private double freq; private final int numSamples; private final int maxSplitsSampled; private static final Log LOGGER = LogFactory.getLog(MobiusInputSampler.class); public MobiusInputSampler(double freq, int numSamples, int maxSplitsSampled) { this.freq = freq; this.numSamples = numSamples; this.maxSplitsSampled = maxSplitsSampled; } private AbstractMobiusMapper getMapper(InputFormat inf, InputSplit split, JobConf conf) throws IOException { AbstractMobiusMapper mapper = null; if (inf instanceof MobiusDelegatingInputFormat) { Class<AbstractMobiusMapper> mapperClass = ((MobiusDelegatingInputFormat) inf).getMapper(split, conf); mapper = ReflectionUtils.newInstance(mapperClass, conf); } else { Class<? extends AbstractMobiusMapper> mapperClass = (Class<? extends AbstractMobiusMapper>) Util .getClass(conf.get(ConfigureConstants.MAPPER_CLASS)); mapper = ReflectionUtils.newInstance(mapperClass, conf); } return mapper; } @Override public Object[] getSample(InputFormat inf, JobConf job) throws IOException { // the following codes are copied from {@link InputSampler#RandomSampler}, // but require some modifications. InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks()); ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples); int splitsToSample = Math.min(this.maxSplitsSampled, splits.length); Random r = new Random(); long seed = r.nextLong(); r.setSeed(seed); // get Sorters Sorter[] sorters = null; if (job.get(ConfigureConstants.SORTERS, null) != null) { // total sort job sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job); } else { // there is no sorter, should be reducer/join job Column[] keys = (Column[]) SerializableUtil .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job); sorters = new Sorter[keys.length]; for (int i = 0; i < keys.length; i++) { sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC); } } long proportion = 10L; while ((int) (this.freq * proportion) == 0) { proportion = proportion * 10; } proportion = 5L * proportion; // shuffle splits for (int i = 0; i < splits.length; ++i) { InputSplit tmp = splits[i]; int j = r.nextInt(splits.length); splits[i] = splits[j]; splits[j] = tmp; } SamplingOutputCollector collector = new SamplingOutputCollector(); for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) { LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size()); RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job, Reporter.NULL); WritableComparable key = reader.createKey(); WritableComparable value = reader.createValue(); if (!(inf instanceof MobiusDelegatingInputFormat)) { // not mobius delegating input format, so the CURRENT_DATASET_ID // will not be set by inf#getRecordReader, we set them here. // // set the current dataset id, as the AbstractMobiusMapper#configure // method needs this property. job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS)); } Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID)); LOGGER.info("Samples coming from dataset: " + datasetID.toString()); AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job); mapper.configure(job); // reading elements from one split long readElement = 0; while (reader.next(key, value)) { collector.clear(); Tuple tuple = mapper.parse(key, value); readElement++; if (readElement > (((long) numSamples) * ((long) proportion))) { // a split might be very big (ex: a large gz file), // so we just need to read the break; } if (r.nextDouble() <= freq) { if (samples.size() < numSamples) { mapper.joinmap(key, value, collector, Reporter.NULL); // joinmap function might generate more than one output key // per <code>key</code> input. for (Tuple t : collector.getOutKey()) { Tuple mt = Tuple.merge(tuple, t); DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job); samples.add(nkey); } } else { // When exceeding the maximum number of samples, replace // a random element with this one, then adjust the // frequency to reflect the possibility of existing // elements being pushed out mapper.joinmap(key, value, collector, Reporter.NULL); for (Tuple t : collector.getOutKey()) { int ind = r.nextInt(numSamples); if (ind != numSamples) { Tuple mt = Tuple.merge(tuple, t); DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job); samples.set(ind, nkey); } } freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples; } key = reader.createKey(); value = reader.createValue(); } } reader.close(); } LOGGER.info("Samples have been collected, return."); return samples.toArray(); } private DataJoinKey getKey(Tuple tuple, Sorter[] sorter, Byte datasetID, AbstractMobiusMapper mapper, Configuration conf) { Tuple columnsUsedToSort = new Tuple(); for (Sorter aSorter : sorter) { String name = aSorter.getColumn(); Object value = tuple.get(name); columnsUsedToSort.insert(name, value); } DataJoinKey nkey = new DataJoinKey(datasetID, columnsUsedToSort, mapper.extractSortValueKeyword(tuple), mapper.getSortValueComparator()); nkey.setConf(conf); return nkey; } private static class SamplingOutputCollector implements OutputCollector<Tuple, Tuple> { private List<Tuple> keys = new ArrayList<Tuple>(); @Override public void collect(Tuple key, Tuple value) throws IOException { this.keys.add(key); } // to be called for every new key public void clear() { this.keys.clear(); } public List<Tuple> getOutKey() { return this.keys; } } }