/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.net.URLConnection;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ChecksumFileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FSError;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableFactories;
import org.apache.hadoop.io.WritableFactory;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.IFile.InMemoryReader;
import org.apache.hadoop.mapred.IFile.Reader;
import org.apache.hadoop.mapred.IFile.Writer;
import org.apache.hadoop.mapred.Merger.Segment;
import org.apache.hadoop.mapred.SortedRanges.SkipRangeIterator;
import org.apache.hadoop.mapred.TaskTracker.TaskInProgress;
import org.apache.hadoop.mapred.iterative.LoopReduceCacheFilter;
import org.apache.hadoop.mapred.iterative.LoopReduceCacheSwitch;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.util.Progress;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

/** A recovery reduce task (HaLoop). */
class RecoverReducerTask extends Task {

  static { // register a ctor
    WritableFactories.setFactory(RecoverReducerTask.class, new WritableFactory() {
      public Writable newInstance() {
        return new RecoverReducerTask();
      }
    });
  }

  private static final Log LOG = LogFactory.getLog(RecoverReducerTask.class.getName());

  private int numMaps;
  private ReduceCopier reduceCopier;
  private CompressionCodec codec;

  {
    getProgress().setStatus("recover");
    setPhase(TaskStatus.Phase.SHUFFLE); // phase to start with
  }

  private Progress copyPhase;
  private Progress sortPhase;
  private Progress reducePhase;
  private Counters.Counter reduceInputKeyCounter =
      getCounters().findCounter(Counter.REDUCE_INPUT_GROUPS);
  private Counters.Counter reduceInputValueCounter =
      getCounters().findCounter(Counter.REDUCE_INPUT_RECORDS);
  private Counters.Counter reduceOutputCounter =
      getCounters().findCounter(Counter.REDUCE_OUTPUT_RECORDS);
  private Counters.Counter reduceCombineOutputCounter =
      getCounters().findCounter(Counter.COMBINE_OUTPUT_RECORDS);
  private Counters.Counter reduceShuffleBytes =
      getCounters().findCounter(Counter.REDUCE_SHUFFLE_BYTES);

  /** HaLoop: the iteration to be recovered */
  private int recoverIteration;

  /** HaLoop: the step to be recovered */
  private int recoverStep;

  /** HaLoop: loop cache switch */
  private LoopReduceCacheSwitch loopCacheControl;

  /** HaLoop: filter deciding what to cache and what not */
  private LoopReduceCacheFilter loopCacheFilter;

  // A custom comparator for map output files. The ordering is determined by
  // the file's size and path. For files with the same size but different
  // paths, the first argument is considered smaller than the second; files
  // with the same size and path are considered equal.
  private Comparator<FileStatus> mapOutputFileComparator = new Comparator<FileStatus>() {
    public int compare(FileStatus a, FileStatus b) {
      if (a.getLen() < b.getLen())
        return -1;
      else if (a.getLen() == b.getLen())
        if (a.getPath().toString().equals(b.getPath().toString()))
          return 0;
        else
          return -1;
      else
        return 1;
    }
  };

  // A sorted set for keeping a set of map output files on disk
  private final SortedSet<FileStatus> mapOutputFilesOnDisk =
      new TreeSet<FileStatus>(mapOutputFileComparator);

  public RecoverReducerTask() {
    super();
  }

  /** recovery: the map schedule of the latest cache-written iteration */
  List<MapScheduleInfo> mapSchedules = new ArrayList<MapScheduleInfo>();

  public RecoverReducerTask(String jobFile, TaskAttemptID taskId, int partition, int numMaps) {
    super(jobFile, taskId, partition);
  }

  /**
   * HaLoop: reload the per-step job configuration from the schedule log directory.
   */
  private void loadJobConf() {
    try {
      FileSystem fs = FileSystem.get(conf);
      Path path = new Path(MRConstants.SCHEDULE_LOG_DIR + "/" + this.getJobID() + "/conf.job");
      FSDataInputStream in = fs.open(path);
      JobConf job = new JobConf();
      job.readFields(in);
      int num = job.getNumberOfLoopBodySteps();
      for (int i = 0; i < num; i++)
        conf.setStepConf(i, job.getStepConf(i));
      in.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * HaLoop: recover mapper state from the schedule log.
   */
  private void recoverMappersFromScheduleLog() throws IOException, InterruptedException,
      ClassNotFoundException {
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(MRConstants.SCHEDULE_LOG_DIR + "/" + getJobID().toString()
        + "/schedule.log");
    FSDataInputStream scheduleLog = fs.open(path);
    mapSchedules.clear();
    while (scheduleLog.available() > 0) {
      MapScheduleInfo msi = new MapScheduleInfo();
      msi.readFields(scheduleLog);
      mapSchedules.add(msi);
    }
    scheduleLog.close();

    // num of maps
    this.numMaps = mapSchedules.size();

    InetAddress addr = InetAddress.getLocalHost();
    String hostname = addr.getHostName();
    List<MapScheduleInfo> recoverMappers = new ArrayList<MapScheduleInfo>();
    String httpAddress = conf.get("mapred.task.tracker.http.address");
    String[] ipAndPort = httpAddress.split(":");
    System.out.println("current host " + hostname);
    System.out.println("recover for host " + this.getRecoverFromTaskTracker());

    if (this.getNodeFailure()) {
      for (MapScheduleInfo msi : mapSchedules) {
        // find the tasktracker to be recovered
        if (msi.getHttpHost().contains(this.getRecoverFromTaskTracker())
            || this.getRecoverFromTaskTracker().contains(msi.getHttpHost())) {
          recoverMappers.add(msi);
          // replace it with a new http address
          msi.setHttpAddress("http://" + hostname + ":" + ipAndPort[1]);
          // System.out.println("recover from " + msi.getHttpHost());
        }
      }
    }

    // find the latest cached step
    int numSteps = conf.getNumberOfLoopBodySteps();
    int cachedIteration = iteration;
    int cachedStep = step;
    for (int latest = round; latest >= 0; latest--) {
      if (cachedStep > 0) {
        cachedStep--;
      } else {
        cachedStep = numSteps - 1;
        cachedIteration--;
      }
      if (loopCacheControl.isCacheWritten(conf, cachedIteration, cachedStep))
        break;
    }
    this.recoverIteration = cachedIteration;
    this.recoverStep = cachedStep;
    this.iteration = recoverIteration;
    this.step = recoverStep;
    this.round = recoverIteration * numOfLoopBodySteps + recoverStep;

    TaskReporter reporter = new TaskReporter(getProgress(), umbilical);
    reporter.startCommunicationThread();
    boolean useNewApi = conf.getUseNewReducer();

    // set the recovered iteration
    conf.setCurrentIterationAndStep(iteration, step);
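
    // On a node failure, re-run the failed tasktracker's map tasks locally,
    // in batches of concurrently running threads, so that their outputs are
    // available on this host before the shuffle starts; the schedule log is
    // then rewritten with the updated http addresses.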
    if (this.getNodeFailure()) {
      int numConcurrentMappers = 10;
      JobConf mapConf = conf.duplicate();
      mapConf.setInt("io.sort.mb", 100 / numConcurrentMappers);
      Thread[] mthreads = new Thread[numConcurrentMappers];
      int i = 0;
      int num = recoverMappers.size();
      for (i = 0; i < num;) {
        // execute mappers in batch
        int j = 0;
        for (; j < numConcurrentMappers && i < num; j++) {
          MapScheduleInfo msi = recoverMappers.get(i);
          MapTask mt = new MapTask(getJobFile(), msi.getTaskAttemptID(), msi.getPartition(),
              msi.getInputSplit().getClassName(), msi.getInputSplit().getBytes());
          // run map task locally to recover
          mt.setConf(mapConf);
          mt.setCurrentIteration(iteration);
          mt.setCurrentStep(step);
          mt.setRound();
          mt.initialize(mapConf, this.getJobID(), reporter, useNewApi);
          mthreads[j] = new Thread(new MapperRecoverThread(mapConf, umbilical, mt));
          mthreads[j].start();
          i++;
        }
        for (int k = 0; k < j; k++) {
          Thread thread = mthreads[k];
          try {
            thread.join();
          } catch (InterruptedException e) {
            e.printStackTrace();
          }
        }
      }
      System.out.flush();
      System.out.println("recover mappers are finished");

      // overwrite the scheduling log
      FSDataOutputStream jobLog = fs.create(path);
      for (MapScheduleInfo msi : mapSchedules) {
        msi.write(jobLog);
      }
      jobLog.close();
      System.out.println("recover schedule log is rewritten");
    }
  }

  class MapperRecoverThread implements Runnable {
    private MapTask mt;
    private JobConf job;
    private TaskUmbilicalProtocol umbilical;

    public MapperRecoverThread(JobConf job, TaskUmbilicalProtocol umbilical, MapTask mt) {
      this.mt = mt;
      this.job = job;
      this.umbilical = umbilical;
    }

    @Override
    public void run() {
      try {
        mt.run(job, umbilical);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  private CompressionCodec initCodec() {
    // check if map-outputs are to be compressed
    if (conf.getCompressMapOutput()) {
      Class<? extends CompressionCodec> codecClass =
          conf.getMapOutputCompressorClass(DefaultCodec.class);
      return ReflectionUtils.newInstance(codecClass, conf);
    }
    return null;
  }

  @Override
  public TaskRunner createRunner(TaskTracker tracker, TaskInProgress tip) throws IOException {
    return new ReduceTaskRunner(tip, tracker, this.conf);
  }

  @Override
  public boolean isMapTask() {
    return false;
  }

  public int getNumMaps() {
    return numMaps;
  }

  /** Localize the given JobConf to be specific for this task. */
  @Override
  public void localizeConfiguration(JobConf conf) throws IOException {
    super.localizeConfiguration(conf);
    conf.setNumMapTasks(numMaps);
  }

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    out.writeInt(numMaps); // write the number of maps
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    numMaps = in.readInt();
  }

  // Get the input files for the reducer.
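  // In local mode the inputs are the per-map files returned by
  // mapOutputFile.getInputFile(); otherwise they are the on-disk files
  // accumulated in mapOutputFilesOnDisk by the shuffle.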
private Path[] getMapFiles(FileSystem fs, boolean isLocal) throws IOException { List<Path> fileList = new ArrayList<Path>(); if (isLocal) { // for local jobs for (int i = 0; i < numMaps; ++i) { fileList.add(mapOutputFile.getInputFile(i, getTaskID(), round)); } } else { // for non local jobs for (FileStatus filestatus : mapOutputFilesOnDisk) { fileList.add(filestatus.getPath()); } } return fileList.toArray(new Path[0]); } private class ReduceValuesIterator<KEY, VALUE> extends ValuesIterator<KEY, VALUE> { public ReduceValuesIterator(RawKeyValueIterator in, RawComparator<KEY> comparator, Class<KEY> keyClass, Class<VALUE> valClass, Configuration conf, Progressable reporter) throws IOException { super(in, comparator, keyClass, valClass, conf, reporter); } protected VALUE moveToNext() { return super.next(); } } private class SkippingReduceValuesIterator<KEY, VALUE> extends ReduceValuesIterator<KEY, VALUE> { private SkipRangeIterator skipIt; private TaskUmbilicalProtocol umbilical; private Counters.Counter skipGroupCounter; private Counters.Counter skipRecCounter; private long grpIndex = -1; private Class<KEY> keyClass; private Class<VALUE> valClass; private SequenceFile.Writer skipWriter; private boolean toWriteSkipRecs; private boolean hasNext; private TaskReporter reporter; public SkippingReduceValuesIterator(RawKeyValueIterator in, RawComparator<KEY> comparator, Class<KEY> keyClass, Class<VALUE> valClass, Configuration conf, TaskReporter reporter, TaskUmbilicalProtocol umbilical) throws IOException { super(in, comparator, keyClass, valClass, conf, reporter); this.umbilical = umbilical; this.skipGroupCounter = reporter.getCounter(Counter.REDUCE_SKIPPED_GROUPS); this.skipRecCounter = reporter.getCounter(Counter.REDUCE_SKIPPED_RECORDS); this.toWriteSkipRecs = toWriteSkipRecs() && SkipBadRecords.getSkipOutputPath(conf) != null; this.keyClass = keyClass; this.valClass = valClass; this.reporter = reporter; skipIt = getSkipRanges().skipRangeIterator(); mayBeSkip(); } void nextKey() throws IOException { super.nextKey(); mayBeSkip(); } public boolean more() { return super.more() && hasNext; } private void mayBeSkip() throws IOException { hasNext = skipIt.hasNext(); if (!hasNext) { LOG.warn("Further groups got skipped."); return; } grpIndex++; long nextGrpIndex = skipIt.next(); long skip = 0; long skipRec = 0; while (grpIndex < nextGrpIndex && super.more()) { while (hasNext()) { VALUE value = moveToNext(); if (toWriteSkipRecs) { writeSkippedRec(getKey(), value); } skipRec++; } super.nextKey(); grpIndex++; skip++; } // close the skip writer once all the ranges are skipped if (skip > 0 && skipIt.skippedAllRanges() && skipWriter != null) { skipWriter.close(); } skipGroupCounter.increment(skip); skipRecCounter.increment(skipRec); reportNextRecordRange(umbilical, grpIndex); } @SuppressWarnings("unchecked") private void writeSkippedRec(KEY key, VALUE value) throws IOException { if (skipWriter == null) { Path skipDir = SkipBadRecords.getSkipOutputPath(conf); Path skipFile = new Path(skipDir, getTaskID().toString()); skipWriter = SequenceFile.createWriter(skipFile.getFileSystem(conf), conf, skipFile, keyClass, valClass, CompressionType.BLOCK, reporter); } skipWriter.append(key, value); } } @Override @SuppressWarnings("unchecked") public void run(JobConf job, final TaskUmbilicalProtocol umbilical) throws IOException, InterruptedException, ClassNotFoundException { job = conf; if (isMapOrReduce()) { copyPhase = getProgress().addPhase("copy"); sortPhase = getProgress().addPhase("sort"); reducePhase = 
getProgress().addPhase("reduce"); } this.loadJobConf(); long start = System.currentTimeMillis(); this.umbilical = umbilical; job.setBoolean("mapred.skip.on", isSkipping()); // start thread that will handle communication with parent TaskReporter reporter = new TaskReporter(getProgress(), umbilical); reporter.startCommunicationThread(); boolean useNewApi = job.getUseNewReducer(); initialize(job, getJobID(), reporter, useNewApi); // check if it is a cleanupJobTask if (jobCleanup) { runJobCleanupTask(umbilical, reporter); return; } if (jobSetup) { runJobSetupTask(umbilical, reporter); return; } if (taskCleanup) { runTaskCleanupTask(umbilical, reporter); return; } loopCacheControl = ReflectionUtils.newInstance(conf.getLoopReduceCacheSwitch(), conf); loopCacheFilter = ReflectionUtils.newInstance(conf.getLoopReduceCacheFilter(), conf); // Initialize the codec codec = initCodec(); /** * HaLoop: recover from schedule log */ // if (this.getNodeFailure()) recoverMappersFromScheduleLog(); /** * HaLoop: initialize loopCacheControl */ if (job.isIterative() && loopCacheControl == null) { Class<? extends LoopReduceCacheSwitch> cacheControl = job.getLoopReduceCacheSwitch(); loopCacheControl = ReflectionUtils.newInstance(cacheControl, job); } /** * HaLoop: set up loop filter */ if (job.isIterative() && loopCacheFilter == null) { Class<? extends LoopReduceCacheFilter> cacheFilter = job.getLoopReduceCacheFilter(); loopCacheFilter = ReflectionUtils.newInstance(cacheFilter, job); } boolean isLocal = "local".equals(job.get("mapred.job.tracker", "local")); if (!isLocal) { reduceCopier = new ReduceCopier(umbilical, job, reporter); if (!reduceCopier.fetchOutputs()) { if (reduceCopier.mergeThrowable instanceof FSError) { throw (FSError) reduceCopier.mergeThrowable; } throw new IOException("Task: " + getTaskID() + " - The reduce copier failed", reduceCopier.mergeThrowable); } } System.out.println("recover reduce copy finished"); final FileSystem rfs = FileSystem.getLocal(job).getRaw(); RawKeyValueIterator rIter = isLocal ? 
Merger.merge(job, rfs, job.getMapOutputKeyClass(), job.getMapOutputValueClass(), codec, getMapFiles(rfs, true), !conf.getKeepFailedTaskFiles(), job.getInt("io.sort.factor", 100), new Path(getTaskID().toString()), job.getOutputKeyComparator(), reporter, spilledRecordsCounter, null) : reduceCopier.createKVIterator(job, rfs, reporter); // free up the data structures mapOutputFilesOnDisk.clear(); // setPhase(TaskStatus.Phase.REDUCE); // statusUpdate(umbilical); Class keyClass = job.getMapOutputKeyClass(); Class valueClass = job.getMapOutputValueClass(); RawComparator comparator = job.getOutputValueGroupingComparator(); System.out.println("run recover"); // if this is a caching iteration, do recovery if (loopCacheControl.isCacheWritten(conf, iteration, step)) runRecover(job, umbilical, reporter, rIter, comparator, keyClass, valueClass); long end = System.currentTimeMillis(); System.out.println("reducer recover running time overall: " + (end - start) + "ms"); } /** * default value for cache flag is true */ private int numValues = 0; /** * HaLoop reduce cache logic is mainly in this method * * @param <INKEY> * @param <INVALUE> * @param <OUTKEY> * @param <OUTVALUE> * @param job * @param umbilical * @param reporter * @param rIter * @param comparator * @param keyClass * @param valueClass * @throws IOException */ @SuppressWarnings("unchecked") private <INKEY, INVALUE, OUTKEY, OUTVALUE> void runRecover(JobConf job, TaskUmbilicalProtocol umbilical, final TaskReporter reporter, RawKeyValueIterator rIter, RawComparator<INKEY> comparator, Class<INKEY> keyClass, Class<INVALUE> valueClass) throws IOException { ReduceValuesIterator<INKEY, INVALUE> values = isSkipping() ? new SkippingReduceValuesIterator<INKEY, INVALUE>(rIter, comparator, keyClass, valueClass, job, reporter, umbilical) : new ReduceValuesIterator<INKEY, INVALUE>(rIter, job.getOutputValueGroupingComparator(), keyClass, valueClass, job, reporter); long writePos = 0; SerializationFactory serializationFactory = new SerializationFactory(job); Serializer<INVALUE> valSerializer = serializationFactory.getSerializer(valueClass); Serializer<INKEY> indexKeySerializer = serializationFactory.getSerializer(keyClass); Serializer<LongWritable> indexPositionSerializer = serializationFactory.getSerializer(LongWritable.class); Serializer<IntWritable> sizeSerializer = serializationFactory.getSerializer(IntWritable.class); FSDataOutputStream fileOutput = new FSDataOutputStream(null); FSDataOutputStream indexOutput = new FSDataOutputStream(null); if (job.isIterative() && loopCacheControl.isCacheWritten(job, iteration, step)) { Path filePath = mapOutputFile.getReduceCacheFileForWrite(getTaskID(), -1, round); System.out.println("Lian: The file path for " + getTaskID() + "--" + filePath); Path indexPath = mapOutputFile.getCacheIndexFileForWrite(getTaskID(), -1, round); FileSystem localFs = FileSystem.getLocal(conf); FileSystem lfs = ((LocalFileSystem) localFs).getRaw(); fileOutput = lfs.create(filePath); indexOutput = lfs.create(indexPath); valSerializer.open(fileOutput); indexKeySerializer.open(indexOutput); indexPositionSerializer.open(indexOutput); sizeSerializer.open(indexOutput); } DataOutputBuffer bb = new DataOutputBuffer(); DataInputBuffer ib = new DataInputBuffer(); Serializer<INVALUE> ssl = serializationFactory.getSerializer(valueClass); Deserializer<INVALUE> dsl = serializationFactory.getDeserializer(valueClass); ssl.open(bb); dsl.open(ib); long iterationStart = System.currentTimeMillis(); reduceTime = 0; long smallTime = 0; System.out.println("start 
recover reduce"); while (values.more()) { INKEY key; // Yingyi's code: build the cache; if (conf.isIterative() && loopCacheControl.isCacheWritten(job, iteration, step)) { // write cache, in a loop job key = values.getKey(); indexKeySerializer.serialize(key); indexPositionSerializer.serialize(new LongWritable(writePos)); numValues = 0; CacheWriteIterator<INKEY, INVALUE> valueIterator = new CacheWriteIterator<INKEY, INVALUE>(key, valSerializer, values); // just iterate, does not call reduce function while (valueIterator.hasNext()) { valueIterator.next(); } writePos = fileOutput.getPos(); values.nextKey(); } } if (conf.isIterative() == true && loopCacheControl.isCacheWritten(job, iteration, step)) { fileOutput.close(); indexOutput.close(); } System.out.println("recovery iteration " + round + ":" + reduceTime + "ms"); System.out.println("recovery branch time: " + smallTime); long iterationEnd = System.currentTimeMillis(); System.out.println("recovery total time: " + (iterationEnd - iterationStart) + "ms"); } private class CacheWriteIterator<K, T> extends ValuesIterator<K, T> { Serializer<T> serializer = null; ValuesIterator<K, T> values = null; int count = 0; K key = null; public CacheWriteIterator(K k, Serializer<T> ser, Iterator<T> vals) { key = k; serializer = ser; values = (ValuesIterator<K, T>) vals; } public T next() { numValues++; T value = values.next(); count++; try { if (loopCacheFilter.isCache(key, value, count)) serializer.serialize(value); return value; } catch (IOException e) { e.printStackTrace(); return null; } } public boolean more() { return values.more(); } public boolean hasNext() { if (serializer == null || values == null) return false; return values.hasNext(); } public void remove() { } } private long reduceTime = 0; static class NewTrackingRecordWriter<K, V> extends org.apache.hadoop.mapreduce.RecordWriter<K, V> { private final org.apache.hadoop.mapreduce.RecordWriter<K, V> real; private final org.apache.hadoop.mapreduce.Counter outputRecordCounter; NewTrackingRecordWriter(org.apache.hadoop.mapreduce.RecordWriter<K, V> real, org.apache.hadoop.mapreduce.Counter recordCounter) { this.real = real; this.outputRecordCounter = recordCounter; } @Override public void close(TaskAttemptContext context) throws IOException, InterruptedException { real.close(context); } @Override public void write(K key, V value) throws IOException, InterruptedException { real.write(key, value); outputRecordCounter.increment(1); } } class ReduceCopier<K, V> implements MRConstants { /** Reference to the umbilical object */ private TaskUmbilicalProtocol umbilical; private final TaskReporter reporter; /** Reference to the task object */ /** Number of ms before timing out a copy */ private static final int STALLED_COPY_TIMEOUT = 3 * 60 * 1000; /** Max events to fetch in one go from the tasktracker */ private static final int MAX_EVENTS_TO_FETCH = 10000; /** * our reduce task instance */ private RecoverReducerTask reduceTask; /** * the list of map outputs currently being copied */ private List<MapOutputLocation> scheduledCopies; /** * the results of dispatched copy attempts */ private List<CopyResult> copyResults; /** * the number of outputs to copy in parallel */ private int numCopiers; /** * a number that is set to the max #fetches we'd schedule and then pause * the schduling */ private int maxInFlight; /** * the amount of time spent on fetching one map output before * considering it as failed and notifying the jobtracker about it. 
*/ private int maxBackoff; /** * busy hosts from which copies are being backed off Map of host -> next * contact time */ private Map<String, Long> penaltyBox; /** * the set of unique hosts from which we are copying */ private Set<String> uniqueHosts; /** * A reference to the RamManager for writing the map outputs to. */ private ShuffleRamManager ramManager; /** * A reference to the local file system for writing the map outputs to. */ private FileSystem localFileSys; private FileSystem rfs; /** * Number of files to merge at a time */ private int ioSortFactor; /** * A reference to the throwable object (if merge throws an exception) */ private volatile Throwable mergeThrowable; /** * A flag to indicate when to exit localFS merge */ private volatile boolean exitLocalFSMerge = false; /** * A flag to indicate when to exit getMapEvents thread */ private volatile boolean exitGetMapEvents = false; /** * When we accumulate maxInMemOutputs number of files in ram, we * merge/spill */ private final int maxInMemOutputs; /** * Usage threshold for in-memory output accumulation. */ private final float maxInMemCopyPer; /** * Maximum memory usage of map outputs to merge from memory into the * reduce, in bytes. */ private final long maxInMemReduce; /** * The threads for fetching the files. */ private List<MapOutputCopier> copiers = null; /** * The object for metrics reporting. */ private ShuffleClientMetrics shuffleClientMetrics = null; /** * the minimum interval between tasktracker polls */ private static final long MIN_POLL_INTERVAL = 1000; /** * a list of map output locations for fetch retrials */ private List<MapOutputLocation> retryFetches = new ArrayList<MapOutputLocation>(); /** * The set of required map outputs */ private Set<TaskID> copiedMapOutputs = Collections.synchronizedSet(new TreeSet<TaskID>()); /** * The set of obsolete map taskids. */ private Set<TaskAttemptID> obsoleteMapIds = Collections.synchronizedSet(new TreeSet<TaskAttemptID>()); private Random random = null; /** * the max of all the map completion times */ private int maxMapRuntime; /** * Maximum number of fetch-retries per-map. */ private volatile int maxFetchRetriesPerMap; /** * Combiner runner, if a combiner is needed */ private CombinerRunner combinerRunner; /** * Resettable collector used for combine. */ private CombineOutputCollector combineCollector = null; /** * Maximum percent of failed fetch attempt before killing the reduce * task. */ private static final float MAX_ALLOWED_FAILED_FETCH_ATTEMPT_PERCENT = 0.5f; /** * Minimum percent of progress required to keep the reduce alive. */ private static final float MIN_REQUIRED_PROGRESS_PERCENT = 0.5f; /** * Maximum percent of shuffle execution time required to keep the * reducer alive. */ private static final float MAX_ALLOWED_STALL_TIME_PERCENT = 0.5f; /** * Minimum number of map fetch retries. */ private static final int MIN_FETCH_RETRIES_PER_MAP = 2; /** * Maximum no. of unique maps from which we failed to fetch map-outputs * even after {@link #maxFetchRetriesPerMap} retries; after this the * reduce task is failed. */ private int maxFailedUniqueFetches = 5; /** * The maps from which we fail to fetch map-outputs even after * {@link #maxFetchRetriesPerMap} retries. */ Set<TaskID> fetchFailedMaps = new TreeSet<TaskID>(); /** * A map of taskId -> no. 
of failed fetches */ Map<TaskAttemptID, Integer> mapTaskToFailedFetchesMap = new HashMap<TaskAttemptID, Integer>(); /** * Initial backoff interval (milliseconds) */ private static final int BACKOFF_INIT = 4000; /** * The interval for logging in the shuffle */ private static final int MIN_LOG_TIME = 60000; /** * List of in-memory map-outputs. */ private final List<MapOutput> mapOutputsFilesInMemory = Collections .synchronizedList(new LinkedList<MapOutput>()); /** * The map for (Hosts, List of MapIds from this Host) maintaining map * output locations */ private final Map<String, List<MapOutputLocation>> mapLocations = new ConcurrentHashMap<String, List<MapOutputLocation>>(); /** * This class contains the methods that should be used for * metrics-reporting the specific metrics for shuffle. This class * actually reports the metrics for the shuffle client (the ReduceTask), * and hence the name ShuffleClientMetrics. */ class ShuffleClientMetrics implements Updater { private MetricsRecord shuffleMetrics = null; private int numFailedFetches = 0; private int numSuccessFetches = 0; private long numBytes = 0; private int numThreadsBusy = 0; ShuffleClientMetrics(JobConf conf) { MetricsContext metricsContext = MetricsUtil.getContext("mapred"); this.shuffleMetrics = MetricsUtil.createRecord(metricsContext, "shuffleInput"); this.shuffleMetrics.setTag("user", conf.getUser()); this.shuffleMetrics.setTag("jobName", conf.getJobName()); this.shuffleMetrics.setTag("jobId", RecoverReducerTask.this.getJobID().toString()); this.shuffleMetrics.setTag("taskId", getTaskID().toString()); this.shuffleMetrics.setTag("sessionId", conf.getSessionId()); metricsContext.registerUpdater(this); } public synchronized void inputBytes(long numBytes) { this.numBytes += numBytes; } public synchronized void failedFetch() { ++numFailedFetches; } public synchronized void successFetch() { ++numSuccessFetches; } public synchronized void threadBusy() { ++numThreadsBusy; } public synchronized void threadFree() { --numThreadsBusy; } public void doUpdates(MetricsContext unused) { synchronized (this) { shuffleMetrics.incrMetric("shuffle_input_bytes", numBytes); shuffleMetrics.incrMetric("shuffle_failed_fetches", numFailedFetches); shuffleMetrics.incrMetric("shuffle_success_fetches", numSuccessFetches); if (numCopiers != 0) { shuffleMetrics.setMetric("shuffle_fetchers_busy_percent", 100 * ((float) numThreadsBusy / numCopiers)); } else { shuffleMetrics.setMetric("shuffle_fetchers_busy_percent", 0); } numBytes = 0; numSuccessFetches = 0; numFailedFetches = 0; } shuffleMetrics.update(); } } /** Represents the result of an attempt to copy a map output */ private class CopyResult { // the map output location against which a copy attempt was made private final MapOutputLocation loc; // the size of the file copied, -1 if the transfer failed private final long size; // a flag signifying whether a copy result is obsolete private static final int OBSOLETE = -2; CopyResult(MapOutputLocation loc, long size) { this.loc = loc; this.size = size; } public boolean getSuccess() { return size >= 0; } public boolean isObsolete() { return size == OBSOLETE; } public long getSize() { return size; } public String getHost() { return loc.getHost(); } public MapOutputLocation getLocation() { return loc; } } private int nextMapOutputCopierId = 0; /** * Abstraction to track a map-output. 
*/ private class MapOutputLocation { TaskAttemptID taskAttemptId; TaskID taskId; String ttHost; URL taskOutput; public MapOutputLocation(TaskAttemptID taskAttemptId, String ttHost, URL taskOutput) { this.taskAttemptId = taskAttemptId; this.taskId = this.taskAttemptId.getTaskID(); this.ttHost = ttHost; this.taskOutput = taskOutput; } public TaskAttemptID getTaskAttemptId() { return taskAttemptId; } public TaskID getTaskId() { return taskId; } public String getHost() { return ttHost; } public URL getOutputLocation() { return taskOutput; } } /** Describes the output of a map; could either be on disk or in-memory. */ private class MapOutput { final TaskID mapId; final TaskAttemptID mapAttemptId; final Path file; final Configuration conf; byte[] data; final boolean inMemory; long compressedSize; public MapOutput(TaskID mapId, TaskAttemptID mapAttemptId, Configuration conf, Path file, long size) { this.mapId = mapId; this.mapAttemptId = mapAttemptId; this.conf = conf; this.file = file; this.compressedSize = size; this.data = null; this.inMemory = false; } public MapOutput(TaskID mapId, TaskAttemptID mapAttemptId, byte[] data, int compressedLength) { this.mapId = mapId; this.mapAttemptId = mapAttemptId; this.file = null; this.conf = null; this.data = data; this.compressedSize = compressedLength; this.inMemory = true; } public void discard() throws IOException { if (inMemory) { data = null; } else { FileSystem fs = file.getFileSystem(conf); fs.delete(file, true); } } } class ShuffleRamManager implements RamManager { /* * Maximum percentage of the in-memory limit that a single shuffle * can consume */ private static final float MAX_SINGLE_SHUFFLE_SEGMENT_FRACTION = 0.25f; /* * Maximum percentage of shuffle-threads which can be stalled * simultaneously after which a merge is triggered. */ private static final float MAX_STALLED_SHUFFLE_THREADS_FRACTION = 0.75f; private final int maxSize; private final int maxSingleShuffleLimit; private int size = 0; private Object dataAvailable = new Object(); private int fullSize = 0; private int numPendingRequests = 0; private int numRequiredMapOutputs = 0; private int numClosed = 0; private boolean closed = false; public ShuffleRamManager(Configuration conf) throws IOException { final float maxInMemCopyUse = conf.getFloat("mapred.job.shuffle.input.buffer.percent", 0.70f); if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) { throw new IOException("mapred.job.shuffle.input.buffer.percent" + maxInMemCopyUse); } maxSize = (int) Math.min(Runtime.getRuntime().maxMemory() * maxInMemCopyUse, Integer.MAX_VALUE); maxSingleShuffleLimit = (int) (maxSize * MAX_SINGLE_SHUFFLE_SEGMENT_FRACTION); LOG.info("ShuffleRamManager: MemoryLimit=" + maxSize + ", MaxSingleShuffleLimit=" + maxSingleShuffleLimit); } public synchronized boolean reserve(int requestedSize, InputStream in) throws InterruptedException { // Wait till the request can be fulfilled... while ((size + requestedSize) > maxSize) { // Close the input... 
if (in != null) { try { in.close(); } catch (IOException ie) { LOG.info("Failed to close connection with: " + ie); } finally { in = null; } } // Track pending requests synchronized (dataAvailable) { ++numPendingRequests; dataAvailable.notify(); } // Wait for memory to free up wait(); // Track pending requests synchronized (dataAvailable) { --numPendingRequests; } } size += requestedSize; return (in != null); } public synchronized void unreserve(int requestedSize) { size -= requestedSize; synchronized (dataAvailable) { fullSize -= requestedSize; --numClosed; } // Notify the threads blocked on RamManager.reserve notifyAll(); } public boolean waitForDataToMerge() throws InterruptedException { boolean done = false; synchronized (dataAvailable) { // Start in-memory merge if manager has been closed or... while (!closed && // In-memory threshold exceeded and at least two // segments // have been fetched (getPercentUsed() < maxInMemCopyPer || numClosed < 2) && // More than "mapred.inmem.merge.threshold" map // outputs // have been fetched into memory (maxInMemOutputs <= 0 || numClosed < maxInMemOutputs) && // More than MAX... threads are blocked on the // RamManager // or the blocked threads are the last map outputs // to be // fetched. If numRequiredMapOutputs is zero, either // setNumCopiedMapOutputs has not been called (no // map ouputs // have been fetched, so there is nothing to merge) // or the // last map outputs being transferred without // contention, so a merge would be premature. (numPendingRequests < numCopiers * MAX_STALLED_SHUFFLE_THREADS_FRACTION && (0 == numRequiredMapOutputs || numPendingRequests < numRequiredMapOutputs))) { dataAvailable.wait(); } done = closed; } return done; } public void closeInMemoryFile(int requestedSize) { synchronized (dataAvailable) { fullSize += requestedSize; ++numClosed; dataAvailable.notify(); } } public void setNumCopiedMapOutputs(int numRequiredMapOutputs) { synchronized (dataAvailable) { this.numRequiredMapOutputs = numRequiredMapOutputs; dataAvailable.notify(); } } public void close() { synchronized (dataAvailable) { closed = true; LOG.info("Closed ram manager"); dataAvailable.notify(); } } private float getPercentUsed() { return (float) fullSize / maxSize; } int getMemoryLimit() { return maxSize; } boolean canFitInMemory(long requestedSize) { return (requestedSize < Integer.MAX_VALUE && requestedSize < maxSingleShuffleLimit); } } /** Copies map outputs as they become available */ private class MapOutputCopier extends Thread { // basic/unit connection timeout (in milliseconds) private final static int UNIT_CONNECT_TIMEOUT = 30 * 1000; // default read timeout (in milliseconds) private final static int DEFAULT_READ_TIMEOUT = 3 * 60 * 1000; private MapOutputLocation currentLocation = null; private int id = nextMapOutputCopierId++; private Reporter reporter; // Decompression of map-outputs private CompressionCodec codec = null; private Decompressor decompressor = null; public MapOutputCopier(JobConf job, Reporter reporter) { setName("MapOutputCopier " + reduceTask.getTaskID() + "." + id); LOG.debug(getName() + " created"); this.reporter = reporter; if (job.getCompressMapOutput()) { Class<? extends CompressionCodec> codecClass = job .getMapOutputCompressorClass(DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, job); decompressor = CodecPool.getDecompressor(codec); } } /** * Fail the current file that we are fetching * * @return were we currently fetching? 
*/ public synchronized boolean fail() { if (currentLocation != null) { finish(-1); return true; } else { return false; } } /** * Get the current map output location. */ public synchronized MapOutputLocation getLocation() { return currentLocation; } private synchronized void start(MapOutputLocation loc) { currentLocation = loc; } private synchronized void finish(long size) { if (currentLocation != null) { LOG.debug(getName() + " finishing " + currentLocation + " =" + size); synchronized (copyResults) { copyResults.add(new CopyResult(currentLocation, size)); copyResults.notify(); } currentLocation = null; } } /** * Loop forever and fetch map outputs as they become available. The * thread exits when it is interrupted by {@link ReduceTaskRunner} */ @Override public void run() { while (true) { try { MapOutputLocation loc = null; long size = -1; synchronized (scheduledCopies) { while (scheduledCopies.isEmpty()) { scheduledCopies.wait(); } loc = scheduledCopies.remove(0); } try { shuffleClientMetrics.threadBusy(); start(loc); size = copyOutput(loc); shuffleClientMetrics.successFetch(); } catch (IOException e) { LOG.warn(reduceTask.getTaskID() + " copy failed: " + loc.getTaskAttemptId() + " from " + loc.getHost()); LOG.warn(StringUtils.stringifyException(e)); shuffleClientMetrics.failedFetch(); // Reset size = -1; } finally { shuffleClientMetrics.threadFree(); finish(size); } } catch (InterruptedException e) { break; // ALL DONE } catch (FSError e) { LOG.error("Task: " + reduceTask.getTaskID() + " - FSError: " + StringUtils.stringifyException(e)); try { umbilical.fsError(reduceTask.getTaskID(), e.getMessage()); } catch (IOException io) { LOG.error("Could not notify TT of FSError: " + StringUtils.stringifyException(io)); } } catch (Throwable th) { String msg = getTaskID() + " : Map output copy failure : " + StringUtils.stringifyException(th); reportFatalError(getTaskID(), th, msg); } } if (decompressor != null) { CodecPool.returnDecompressor(decompressor); } } /** * Copies a a map output from a remote host, via HTTP. * * @param currentLocation * the map output location to be copied * @return the path (fully qualified) of the copied file * @throws IOException * if there is an error copying the file * @throws InterruptedException * if the copier should give up */ private long copyOutput(MapOutputLocation loc) throws IOException, InterruptedException { // check if we still need to copy the output from this location if (copiedMapOutputs.contains(loc.getTaskId()) || obsoleteMapIds.contains(loc.getTaskAttemptId())) { return CopyResult.OBSOLETE; } // a temp filename. 
If this file gets created in ramfs, we're // fine, // else, we will check the localFS to find a suitable final // location // for this path TaskAttemptID reduceId = reduceTask.getTaskID(); Path filename = new Path("/" + TaskTracker.getIntermediateOutputDir(reduceId.getJobID().toString(), reduceId.toString()) + "/map_" + loc.getTaskId().getId() + ".out"); // Copy the map output to a temp file whose name is unique to // this attempt Path tmpMapOutput = new Path(filename + "-" + id); // Copy the map output MapOutput mapOutput = getMapOutput(loc, tmpMapOutput, reduceId.getTaskID().getId()); if (mapOutput == null) { throw new IOException( "Failed to fetch map-output for " + loc.getTaskAttemptId() + " from " + loc.getHost()); } // The size of the map-output long bytes = mapOutput.compressedSize; // lock the ReduceTask while we do the rename synchronized (RecoverReducerTask.this) { if (copiedMapOutputs.contains(loc.getTaskId())) { mapOutput.discard(); return CopyResult.OBSOLETE; } // Special case: discard empty map-outputs if (bytes == 0) { try { mapOutput.discard(); } catch (IOException ioe) { LOG.info("Couldn't discard output of " + loc.getTaskId()); } // Note that we successfully copied the map-output noteCopiedMapOutput(loc.getTaskId()); return bytes; } // Process map-output if (mapOutput.inMemory) { // Save it in the synchronized list of map-outputs mapOutputsFilesInMemory.add(mapOutput); } else { // Rename the temporary file to the final file; // ensure it is on the same partition tmpMapOutput = mapOutput.file; filename = new Path(tmpMapOutput.getParent(), filename.getName()); if (!localFileSys.rename(tmpMapOutput, filename)) { localFileSys.delete(tmpMapOutput, true); bytes = -1; throw new IOException( "Failed to rename map output " + tmpMapOutput + " to " + filename); } synchronized (mapOutputFilesOnDisk) { addToMapOutputFilesOnDisk(localFileSys.getFileStatus(filename)); } } // Note that we successfully copied the map-output noteCopiedMapOutput(loc.getTaskId()); } return bytes; } /** * Save the map taskid whose output we just copied. This function * assumes that it has been synchronized on ReduceTask.this. * * @param taskId * map taskid */ private void noteCopiedMapOutput(TaskID taskId) { copiedMapOutputs.add(taskId); ramManager.setNumCopiedMapOutputs(numMaps - copiedMapOutputs.size()); } /** * Get the map output into a local file (either in the inmemory fs * or on the local fs) from the remote server. We use the file * system so that we generate checksum files on the data. 
* * @param mapOutputLoc * map-output to be fetched * @param filename * the filename to write the data into * @param connectionTimeout * number of milliseconds for connection timeout * @param readTimeout * number of milliseconds for read timeout * @return the path of the file that got created * @throws IOException * when something goes wrong */ private MapOutput getMapOutput(MapOutputLocation mapOutputLoc, Path filename, int reduce) throws IOException, InterruptedException { // Connect URLConnection connection = mapOutputLoc.getOutputLocation().openConnection(); InputStream input = getInputStream(connection, STALLED_COPY_TIMEOUT, DEFAULT_READ_TIMEOUT); // Validate header from map output TaskAttemptID mapId = null; try { mapId = TaskAttemptID.forName(connection.getHeaderField(FROM_MAP_TASK)); } catch (IllegalArgumentException ia) { LOG.warn("Invalid map id ", ia); return null; } TaskAttemptID expectedMapId = mapOutputLoc.getTaskAttemptId(); if (!mapId.equals(expectedMapId)) { LOG.warn("data from wrong map:" + mapId + " arrived to reduce task " + reduce + ", where as expected map output should be from " + expectedMapId); return null; } long decompressedLength = Long.parseLong(connection.getHeaderField(RAW_MAP_OUTPUT_LENGTH)); long compressedLength = Long.parseLong(connection.getHeaderField(MAP_OUTPUT_LENGTH)); if (compressedLength < 0 || decompressedLength < 0) { LOG.warn(getName() + " invalid lengths in map output header: id: " + mapId + " compressed len: " + compressedLength + ", decompressed len: " + decompressedLength); return null; } int forReduce = (int) Integer.parseInt(connection.getHeaderField(FOR_REDUCE_TASK)); if (forReduce != reduce) { LOG.warn("data for the wrong reduce: " + forReduce + " with compressed len: " + compressedLength + ", decompressed len: " + decompressedLength + " arrived to reduce task " + reduce); return null; } LOG.info("header: " + mapId + ", compressed len: " + compressedLength + ", decompressed len: " + decompressedLength); // We will put a file in memory if it meets certain criteria: // 1. The size of the (decompressed) file should be less than // 25% of // the total inmem fs // 2. There is space available in the inmem fs // Check if this map-output can be saved in-memory boolean shuffleInMemory = ramManager.canFitInMemory(decompressedLength); // Shuffle MapOutput mapOutput = null; // close in-memory shuffling for comparison shuffleInMemory = false; if (shuffleInMemory) { LOG.info("Shuffling " + decompressedLength + " bytes (" + compressedLength + " raw bytes) " + "into RAM from " + mapOutputLoc.getTaskAttemptId()); mapOutput = shuffleInMemory(mapOutputLoc, connection, input, (int) decompressedLength, (int) compressedLength); } else { LOG.info("Shuffling " + decompressedLength + " bytes (" + compressedLength + " raw bytes) " + "into Local-FS from " + mapOutputLoc.getTaskAttemptId()); mapOutput = shuffleToDisk(mapOutputLoc, input, filename, compressedLength); } return mapOutput; } /** * The connection establishment is attempted multiple times and is * given up only on the last failure. Instead of connecting with a * timeout of X, we try connecting with a timeout of x < X but * multiple times. */ private InputStream getInputStream(URLConnection connection, int connectionTimeout, int readTimeout) throws IOException { int unit = 0; if (connectionTimeout < 0) { throw new IOException("Invalid timeout " + "[timeout = " + connectionTimeout + " ms]"); } else if (connectionTimeout > 0) { unit = (UNIT_CONNECT_TIMEOUT > connectionTimeout) ? 
connectionTimeout : UNIT_CONNECT_TIMEOUT; } // set the read timeout to the total timeout connection.setReadTimeout(readTimeout); // set the connect timeout to the unit-connect-timeout connection.setConnectTimeout(unit); while (true) { try { return connection.getInputStream(); } catch (IOException ioe) { // update the total remaining connect-timeout connectionTimeout -= unit; // throw an exception if we have waited for timeout // amount of time // note that the updated value if timeout is used here if (connectionTimeout == 0) { throw ioe; } // reset the connect timeout for the last try if (connectionTimeout < unit) { unit = connectionTimeout; // reset the connect time out for the final connect connection.setConnectTimeout(unit); } } } } private MapOutput shuffleInMemory(MapOutputLocation mapOutputLoc, URLConnection connection, InputStream input, int mapOutputLength, int compressedLength) throws IOException, InterruptedException { // Reserve ram for the map-output boolean createdNow = ramManager.reserve(mapOutputLength, input); // Reconnect if we need to if (!createdNow) { // Reconnect try { connection = mapOutputLoc.getOutputLocation().openConnection(); input = getInputStream(connection, STALLED_COPY_TIMEOUT, DEFAULT_READ_TIMEOUT); } catch (IOException ioe) { LOG.info("Failed reopen connection to fetch map-output from " + mapOutputLoc.getHost()); // Inform the ram-manager ramManager.closeInMemoryFile(mapOutputLength); ramManager.unreserve(mapOutputLength); throw ioe; } } IFileInputStream checksumIn = new IFileInputStream(input, compressedLength); input = checksumIn; // Are map-outputs compressed? if (codec != null) { decompressor.reset(); input = codec.createInputStream(input, decompressor); } // Copy map-output into an in-memory buffer byte[] shuffleData = new byte[mapOutputLength]; MapOutput mapOutput = new MapOutput(mapOutputLoc.getTaskId(), mapOutputLoc.getTaskAttemptId(), shuffleData, compressedLength); int bytesRead = 0; try { int n = input.read(shuffleData, 0, shuffleData.length); while (n > 0) { bytesRead += n; shuffleClientMetrics.inputBytes(n); // indicate we're making progress reporter.progress(); n = input.read(shuffleData, bytesRead, (shuffleData.length - bytesRead)); } LOG.info("Read " + bytesRead + " bytes from map-output for " + mapOutputLoc.getTaskAttemptId()); input.close(); } catch (IOException ioe) { LOG.info("Failed to shuffle from " + mapOutputLoc.getTaskAttemptId(), ioe); // Inform the ram-manager ramManager.closeInMemoryFile(mapOutputLength); ramManager.unreserve(mapOutputLength); // Discard the map-output try { mapOutput.discard(); } catch (IOException ignored) { LOG.info("Failed to discard map-output from " + mapOutputLoc.getTaskAttemptId(), ignored); } mapOutput = null; // Close the streams IOUtils.cleanup(LOG, input); // Re-throw throw ioe; } // Close the in-memory file // ramManager.closeInMemoryFile(mapOutputLength); LOG.info("so far so good"); ramManager.closeInMemoryFile(bytesRead); // Sanity check // if (bytesRead != mapOutputLength) { // // Inform the ram-manager // ramManager.unreserve(mapOutputLength); // // // Discard the map-output // try { // mapOutput.discard(); // } catch (IOException ignored) { // // IGNORED because we are cleaning up // LOG.info("Failed to discard map-output from " // + mapOutputLoc.getTaskAttemptId(), ignored); // } // mapOutput = null; // // throw new IOException("Incomplete map output received for " // + mapOutputLoc.getTaskAttemptId() + " from " // + mapOutputLoc.getOutputLocation() + " (" // + bytesRead + " instead of " + 
mapOutputLength // + ")"); // } // TODO: Remove this after a 'fix' for HADOOP-3647 if (mapOutputLength > 0) { DataInputBuffer dib = new DataInputBuffer(); dib.reset(shuffleData, 0, shuffleData.length); LOG.info( "Rec #1 from " + mapOutputLoc.getTaskAttemptId() + " -> (" + WritableUtils.readVInt(dib) + ", " + WritableUtils.readVInt(dib) + ") from " + mapOutputLoc.getHost()); } else { LOG.info("map output length: " + mapOutputLength); } return mapOutput; } private MapOutput shuffleToDisk(MapOutputLocation mapOutputLoc, InputStream input, Path filename, long mapOutputLength) throws IOException { // Find out a suitable location for the output on // local-filesystem Path localFilename = lDirAlloc.getLocalPathForWrite(filename.toUri().getPath(), mapOutputLength, conf); MapOutput mapOutput = new MapOutput(mapOutputLoc.getTaskId(), mapOutputLoc.getTaskAttemptId(), conf, localFileSys.makeQualified(localFilename), mapOutputLength); // Copy data to local-disk OutputStream output = null; long bytesRead = 0; try { output = rfs.create(localFilename); byte[] buf = new byte[64 * 1024]; int n = input.read(buf, 0, buf.length); while (n > 0) { bytesRead += n; shuffleClientMetrics.inputBytes(n); output.write(buf, 0, n); // indicate we're making progress reporter.progress(); n = input.read(buf, 0, buf.length); } LOG.info("Read " + bytesRead + " bytes from map-output for " + mapOutputLoc.getTaskAttemptId()); output.close(); input.close(); } catch (IOException ioe) { LOG.info("Failed to shuffle from " + mapOutputLoc.getTaskAttemptId(), ioe); // Discard the map-output try { mapOutput.discard(); } catch (IOException ignored) { LOG.info("Failed to discard map-output from " + mapOutputLoc.getTaskAttemptId(), ignored); } mapOutput = null; // Close the streams IOUtils.cleanup(LOG, input, output); // Re-throw throw ioe; } // Sanity check if (bytesRead != mapOutputLength) { try { mapOutput.discard(); } catch (Exception ioe) { // IGNORED because we are cleaning up LOG.info("Failed to discard map-output from " + mapOutputLoc.getTaskAttemptId(), ioe); } catch (Throwable t) { String msg = getTaskID() + " : Failed in shuffle to disk :" + StringUtils.stringifyException(t); reportFatalError(getTaskID(), t, msg); } mapOutput = null; throw new IOException("Incomplete map output received for " + mapOutputLoc.getTaskAttemptId() + " from " + mapOutputLoc.getOutputLocation() + " (" + bytesRead + " instead of " + mapOutputLength + ")"); } return mapOutput; } } // MapOutputCopier private void configureClasspath(JobConf conf) throws IOException { // get the task and the current classloader which will become the // parent Task task = RecoverReducerTask.this; ClassLoader parent = conf.getClassLoader(); // get the work directory which holds the elements we are // dynamically // adding to the classpath File workDir = new File(task.getJobFile()).getParentFile(); ArrayList<URL> urllist = new ArrayList<URL>(); // add the jars and directories to the classpath String jar = conf.getJar(); if (jar != null) { File jobCacheDir = new File(new Path(jar).getParent().toString()); File[] libs = new File(jobCacheDir, "lib").listFiles(); if (libs != null) { for (int i = 0; i < libs.length; i++) { urllist.add(libs[i].toURL()); } } urllist.add(new File(jobCacheDir, "classes").toURL()); urllist.add(jobCacheDir.toURL()); } urllist.add(workDir.toURL()); // create a new classloader with the old classloader as its parent // then set that classloader as the one used by the current jobconf URL[] urls = urllist.toArray(new URL[urllist.size()]); URLClassLoader 
loader = new URLClassLoader(urls, parent); conf.setClassLoader(loader); } public ReduceCopier(TaskUmbilicalProtocol umbilical, JobConf conf, TaskReporter reporter) throws ClassNotFoundException, IOException { configureClasspath(conf); this.reporter = reporter; this.shuffleClientMetrics = new ShuffleClientMetrics(conf); this.umbilical = umbilical; this.reduceTask = RecoverReducerTask.this; this.scheduledCopies = new ArrayList<MapOutputLocation>(100); this.copyResults = new ArrayList<CopyResult>(100); this.numCopiers = conf.getInt("mapred.reduce.parallel.copies", 5); this.maxInFlight = 4 * numCopiers; this.maxBackoff = conf.getInt("mapred.reduce.copy.backoff", 300); Counters.Counter combineInputCounter = reporter.getCounter(Task.Counter.COMBINE_INPUT_RECORDS); this.combinerRunner = CombinerRunner.create(conf, getTaskID(), combineInputCounter, reporter, null); if (combinerRunner != null) { combineCollector = new CombineOutputCollector(reduceCombineOutputCounter); } this.ioSortFactor = conf.getInt("io.sort.factor", 10); // the exponential backoff formula // backoff (t) = init * base^(t-1) // so for max retries we get // backoff(1) + .... + backoff(max_fetch_retries) ~ max // solving which we get // max_fetch_retries ~ log((max * (base - 1) / init) + 1) / // log(base) // for the default value of max = 300 (5min) we get // max_fetch_retries = 6 // the order is 4,8,16,32,64,128. sum of which is 252 sec = 4.2 min // optimizing for the base 2 this.maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP, getClosestPowerOf2((this.maxBackoff * 1000 / BACKOFF_INIT) + 1)); this.maxFailedUniqueFetches = Math.min(numMaps, this.maxFailedUniqueFetches); this.maxInMemOutputs = conf.getInt("mapred.inmem.merge.threshold", 1000); this.maxInMemCopyPer = conf.getFloat("mapred.job.shuffle.merge.percent", 0.66f); final float maxRedPer = conf.getFloat("mapred.job.reduce.input.buffer.percent", 0f); if (maxRedPer > 1.0 || maxRedPer < 0.0) { throw new IOException("mapred.job.reduce.input.buffer.percent" + maxRedPer); } this.maxInMemReduce = (int) Math.min(Runtime.getRuntime().maxMemory() * maxRedPer, Integer.MAX_VALUE); // Setup the RamManager ramManager = new ShuffleRamManager(conf); localFileSys = FileSystem.getLocal(conf); rfs = ((LocalFileSystem) localFileSys).getRaw(); // hosts -> next contact time this.penaltyBox = new LinkedHashMap<String, Long>(); // hostnames this.uniqueHosts = new HashSet<String>(); // Seed the random number generator with a reasonably globally // unique seed long randomSeed = System.nanoTime() + (long) Math.pow(this.reduceTask.getPartition(), (this.reduceTask.getPartition() % 10)); this.random = new Random(randomSeed); this.maxMapRuntime = 0; } private boolean busyEnough(int numInFlight) { return numInFlight > maxInFlight; } public boolean fetchOutputs() throws IOException { int totalFailures = 0; int numInFlight = 0, numCopied = 0; DecimalFormat mbpsFormat = new DecimalFormat("0.00"); final Progress copyPhase = reduceTask.getProgress().phase(); LocalFSMerger localFSMergerThread = null; InMemFSMergeThread inMemFSMergeThread = null; GetMapEventsThread getMapEventsThread = null; for (int i = 0; i < numMaps; i++) { copyPhase.addPhase(); // add sub-phase per file } copiers = new ArrayList<MapOutputCopier>(numCopiers); // start all the copying threads for (int i = 0; i < numCopiers; i++) { MapOutputCopier copier = new MapOutputCopier(conf, reporter); copiers.add(copier); copier.start(); } // start the on-disk-merge thread localFSMergerThread = new LocalFSMerger((LocalFileSystem) localFileSys); 
// start the in memory merger thread inMemFSMergeThread = new InMemFSMergeThread(); localFSMergerThread.start(); inMemFSMergeThread.start(); // start the map events thread getMapEventsThread = new GetMapEventsThread(); getMapEventsThread.start(); // start the clock for bandwidth measurement long startTime = System.currentTimeMillis(); long currentTime = startTime; long lastProgressTime = startTime; long lastOutputTime = 0; // loop until we get all required outputs while (copiedMapOutputs.size() < numMaps && mergeThrowable == null) { currentTime = System.currentTimeMillis(); boolean logNow = false; if (currentTime - lastOutputTime > MIN_LOG_TIME) { lastOutputTime = currentTime; logNow = true; } if (logNow) { LOG.info(reduceTask.getTaskID() + " Need another " + (numMaps - copiedMapOutputs.size()) + " map output(s) " + "where " + numInFlight + " is already in progress"); } // Put the hash entries for the failed fetches. Iterator<MapOutputLocation> locItr = retryFetches.iterator(); while (locItr.hasNext()) { MapOutputLocation loc = locItr.next(); List<MapOutputLocation> locList = mapLocations.get(loc.getHost()); // Check if the list exists. Map output location mapping is // cleared // once the jobtracker restarts and is rebuilt from scratch. // Note that map-output-location mapping will be recreated // and hence // we continue with the hope that we might find some // locations // from the rebuild map. if (locList != null) { // Add to the beginning of the list so that this map is // tried again before the others and we can hasten the // re-execution of this map should there be a problem locList.add(0, loc); } } if (retryFetches.size() > 0) { LOG.info(reduceTask.getTaskID() + ": " + "Got " + retryFetches.size() + " map-outputs from previous failures"); } // clear the "failed" fetches hashmap retryFetches.clear(); // now walk through the cache and schedule what we can int numScheduled = 0; int numDups = 0; synchronized (scheduledCopies) { // Randomize the map output locations to prevent // all reduce-tasks swamping the same tasktracker List<String> hostList = new ArrayList<String>(); hostList.addAll(mapLocations.keySet()); Collections.shuffle(hostList, this.random); Iterator<String> hostsItr = hostList.iterator(); while (hostsItr.hasNext()) { String host = hostsItr.next(); List<MapOutputLocation> knownOutputsByLoc = mapLocations.get(host); // Check if the list exists. Map output location mapping // is // cleared once the jobtracker restarts and is rebuilt // from // scratch. // Note that map-output-location mapping will be // recreated and // hence we continue with the hope that we might find // some // locations from the rebuild map and add then for // fetching. 
if (knownOutputsByLoc == null || knownOutputsByLoc.size() == 0) { continue; } // Identify duplicate hosts here if (uniqueHosts.contains(host)) { numDups += knownOutputsByLoc.size(); continue; } Long penaltyEnd = penaltyBox.get(host); boolean penalized = false; if (penaltyEnd != null) { if (currentTime < penaltyEnd.longValue()) { penalized = true; } else { penaltyBox.remove(host); } } if (penalized) continue; synchronized (knownOutputsByLoc) { locItr = knownOutputsByLoc.iterator(); while (locItr.hasNext()) { MapOutputLocation loc = locItr.next(); // Do not schedule fetches from OBSOLETE maps if (obsoleteMapIds.contains(loc.getTaskAttemptId())) { locItr.remove(); continue; } uniqueHosts.add(host); scheduledCopies.add(loc); locItr.remove(); // remove from knownOutputs numInFlight++; numScheduled++; break; // we have a map from this host } } } scheduledCopies.notifyAll(); } if (numScheduled > 0 || logNow) { LOG.info(reduceTask.getTaskID() + " Scheduled " + numScheduled + " outputs (" + penaltyBox.size() + " slow hosts and" + numDups + " dup hosts)"); } if (penaltyBox.size() > 0 && logNow) { LOG.info("Penalized(slow) Hosts: "); for (String host : penaltyBox.keySet()) { LOG.info(host + " Will be considered after: " + ((penaltyBox.get(host) - currentTime) / 1000) + " seconds."); } } // if we have no copies in flight and we can't schedule anything // new, just wait for a bit try { if (numInFlight == 0 && numScheduled == 0) { // we should indicate progress as we don't want TT to // think // we're stuck and kill us reporter.progress(); Thread.sleep(5000); } } catch (InterruptedException e) { } // IGNORE while (numInFlight > 0 && mergeThrowable == null) { LOG.debug(reduceTask.getTaskID() + " numInFlight = " + numInFlight); // the call to getCopyResult will either // 1) return immediately with a null or a valid CopyResult // object, // or // 2) if the numInFlight is above maxInFlight, return with a // CopyResult object after getting a notification from a // fetcher thread, // So, when getCopyResult returns null, we can be sure that // we aren't busy enough and we should go and get more // mapcompletion // events from the tasktracker CopyResult cr = getCopyResult(numInFlight); if (cr == null) { break; } if (cr.getSuccess()) { // a successful copy numCopied++; lastProgressTime = System.currentTimeMillis(); reduceShuffleBytes.increment(cr.getSize()); long secsSinceStart = (System.currentTimeMillis() - startTime) / 1000 + 1; float mbs = ((float) reduceShuffleBytes.getCounter()) / (1024 * 1024); float transferRate = mbs / secsSinceStart; copyPhase.startNextPhase(); copyPhase.setStatus("copy (" + numCopied + " of " + numMaps + " at " + mbpsFormat.format(transferRate) + " MB/s)"); // Note successful fetch for this mapId to invalidate // (possibly) old fetch-failures fetchFailedMaps.remove(cr.getLocation().getTaskId()); } else if (cr.isObsolete()) { // ignore // Note successful fetch for this mapId to invalidate // (possibly) old fetch-failures // fetchFailedMaps.remove(cr.getLocation().getTaskId()); LOG.info(reduceTask.getTaskID() + " Ignoring obsolete copy result for Map Task: " + cr.getLocation().getTaskAttemptId() + " from host: " + cr.getHost()); } else { retryFetches.add(cr.getLocation()); // note the failed-fetch TaskAttemptID mapTaskId = cr.getLocation().getTaskAttemptId(); TaskID mapId = cr.getLocation().getTaskId(); totalFailures++; Integer noFailedFetches = mapTaskToFailedFetchesMap.get(mapTaskId); noFailedFetches = (noFailedFetches == null) ? 
1 : (noFailedFetches + 1); mapTaskToFailedFetchesMap.put(mapTaskId, noFailedFetches); LOG.info("Task " + getTaskID() + ": Failed fetch #" + noFailedFetches + " from " + mapTaskId); // did the fetch fail too many times? // using a hybrid technique for notifying the // jobtracker. // a. the first notification is sent after max-retries // b. subsequent notifications are sent after 2 retries. if ((noFailedFetches >= maxFetchRetriesPerMap) && ((noFailedFetches - maxFetchRetriesPerMap) % 2) == 0) { synchronized (RecoverReducerTask.this) { taskStatus.addFetchFailedMap(mapTaskId); LOG.info("Failed to fetch map-output from " + mapTaskId + " even after MAX_FETCH_RETRIES_PER_MAP retries... " + " reporting to the JobTracker"); } } // note unique failed-fetch maps if (noFailedFetches == maxFetchRetriesPerMap) { fetchFailedMaps.add(mapId); // did we have too many unique failed-fetch maps? // and did we fail on too many fetch attempts? // and did we progress enough // or did we wait for too long without any progress? // check if the reducer is healthy boolean reducerHealthy = (((float) totalFailures / (totalFailures + numCopied)) < MAX_ALLOWED_FAILED_FETCH_ATTEMPT_PERCENT); // check if the reducer has progressed enough boolean reducerProgressedEnough = (((float) numCopied / numMaps) >= MIN_REQUIRED_PROGRESS_PERCENT); // check if the reducer is stalled for a long time // duration for which the reducer is stalled int stallDuration = (int) (System.currentTimeMillis() - lastProgressTime); // duration for which the reducer ran with progress int shuffleProgressDuration = (int) (lastProgressTime - startTime); // min time the reducer should run without getting // killed int minShuffleRunDuration = (shuffleProgressDuration > maxMapRuntime) ? shuffleProgressDuration : maxMapRuntime; boolean reducerStalled = (((float) stallDuration / minShuffleRunDuration) >= MAX_ALLOWED_STALL_TIME_PERCENT); // kill if not healthy and has insufficient progress if ((fetchFailedMaps.size() >= maxFailedUniqueFetches || fetchFailedMaps.size() == (numMaps - copiedMapOutputs.size())) && !reducerHealthy && (!reducerProgressedEnough || reducerStalled)) { LOG.fatal("Shuffle failed with too many fetch failures " + "and insufficient progress!" + "Killing task " + getTaskID() + "."); umbilical.shuffleError(getTaskID(), "Exceeded MAX_FAILED_UNIQUE_FETCHES;" + " bailing-out."); } } // back off exponentially until num_retries <= // max_retries // back off by max_backoff/2 on subsequent failed // attempts currentTime = System.currentTimeMillis(); int currentBackOff = noFailedFetches <= maxFetchRetriesPerMap ? 
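// exponential while retries remain (4 s, 8 s, 16 s, ... assuming the
// 4-second initial backoff described in the constructor comment),
// otherwise a flat penalty of half the configured max backoff: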
BACKOFF_INIT * (1 << (noFailedFetches - 1)) : (this.maxBackoff * 1000 / 2); penaltyBox.put(cr.getHost(), currentTime + currentBackOff); LOG.warn(reduceTask.getTaskID() + " adding host " + cr.getHost() + " to penalty box, next contact in " + (currentBackOff / 1000) + " seconds"); } uniqueHosts.remove(cr.getHost()); numInFlight--; } } // all done, inform the copiers to exit exitGetMapEvents = true; try { getMapEventsThread.join(); LOG.info("getMapsEventsThread joined."); } catch (InterruptedException ie) { LOG.info("getMapsEventsThread threw an exception: " + StringUtils.stringifyException(ie)); } synchronized (copiers) { synchronized (scheduledCopies) { for (MapOutputCopier copier : copiers) { copier.interrupt(); } copiers.clear(); } } // copiers are done, exit and notify the waiting merge threads synchronized (mapOutputFilesOnDisk) { exitLocalFSMerge = true; mapOutputFilesOnDisk.notify(); } ramManager.close(); // Do a merge of in-memory files (if there are any) if (mergeThrowable == null) { try { // Wait for the on-disk merge to complete localFSMergerThread.join(); LOG.info("Interleaved on-disk merge complete: " + mapOutputFilesOnDisk.size() + " files left."); // wait for an ongoing merge (if it is in flight) to // complete inMemFSMergeThread.join(); LOG.info("In-memory merge complete: " + mapOutputsFilesInMemory.size() + " files left."); } catch (InterruptedException ie) { LOG.warn(reduceTask.getTaskID() + " Final merge of the inmemory files threw an exception: " + StringUtils.stringifyException(ie)); // check if the last merge generated an error if (mergeThrowable != null) { mergeThrowable = ie; } return false; } } return mergeThrowable == null && copiedMapOutputs.size() == numMaps; } private long createInMemorySegments(List<Segment<K, V>> inMemorySegments, long leaveBytes) throws IOException { long totalSize = 0L; synchronized (mapOutputsFilesInMemory) { // fullSize could come from the RamManager, but files can be // closed but not yet present in mapOutputsFilesInMemory long fullSize = 0L; for (MapOutput mo : mapOutputsFilesInMemory) { fullSize += mo.data.length; } while (fullSize > leaveBytes) { MapOutput mo = mapOutputsFilesInMemory.remove(0); totalSize += mo.data.length; fullSize -= mo.data.length; Reader<K, V> reader = new InMemoryReader<K, V>(ramManager, mo.mapAttemptId, mo.data, 0, mo.data.length); Segment<K, V> segment = new Segment<K, V>(reader, true); inMemorySegments.add(segment); } } return totalSize; } /** * Create a RawKeyValueIterator from copied map outputs. All copying * threads have exited, so all of the map outputs are available either * in memory or on disk. We also know that no merges are in progress, so * synchronization is more lax, here. * * The iterator returned must satisfy the following constraints: 1. * Fewer than io.sort.factor files may be sources 2. No more than * maxInMemReduce bytes of map outputs may be resident in memory when * the reduce begins * * If we must perform an intermediate merge to satisfy (1), then we can * keep the excluded outputs from (2) in memory and include them in the * first merge pass. If not, then said outputs must be written to disk * first. 
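 *
 * As an illustration (the numbers are only an example): with io.sort.factor
 * set to 10, fewer than 10 files may act as merge sources, so when fewer
 * than 10 files are already on disk no intermediate merge is needed, and any
 * in-memory outputs that exceed the maxInMemReduce budget are first written
 * out as a single on-disk spill.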
*/ @SuppressWarnings("unchecked") private RawKeyValueIterator createKVIterator(JobConf job, FileSystem fs, Reporter reporter) throws IOException { // merge config params Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass(); Class<V> valueClass = (Class<V>) job.getMapOutputValueClass(); boolean keepInputs = job.getKeepFailedTaskFiles(); final Path tmpDir = new Path(getTaskID().toString()); final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator(); // segments required to vacate memory List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>(); long inMemToDiskBytes = 0; if (mapOutputsFilesInMemory.size() > 0) { TaskID mapId = mapOutputsFilesInMemory.get(0).mapId; inMemToDiskBytes = createInMemorySegments(memDiskSegments, maxInMemReduce); final int numMemDiskSegments = memDiskSegments.size(); if (numMemDiskSegments > 0 && ioSortFactor > mapOutputFilesOnDisk.size()) { // must spill to disk, but can't retain in-mem for // intermediate merge final Path outputPath = mapOutputFile.getInputFileForWrite(mapId, reduceTask.getTaskID(), inMemToDiskBytes, round); final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, reporter, spilledRecordsCounter, null); final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null); try { Merger.writeFile(rIter, writer, reporter, job); addToMapOutputFilesOnDisk(fs.getFileStatus(outputPath)); } catch (Exception e) { if (null != outputPath) { fs.delete(outputPath, true); } throw new IOException("Final merge failed", e); } finally { if (null != writer) { writer.close(); } } LOG.info("Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes to disk to satisfy " + "reduce memory limit"); inMemToDiskBytes = 0; memDiskSegments.clear(); } else if (inMemToDiskBytes != 0) { LOG.info("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge"); } } // segments on disk List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>(); long onDiskBytes = inMemToDiskBytes; Path[] onDisk = getMapFiles(fs, false); for (Path file : onDisk) { onDiskBytes += fs.getFileStatus(file).getLen(); diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs)); } LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk"); Collections.sort(diskSegments, new Comparator<Segment<K, V>>() { public int compare(Segment<K, V> o1, Segment<K, V> o2) { if (o1.getLength() == o2.getLength()) { return 0; } return o1.getLength() < o2.getLength() ? 
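// ascending by length, so the smallest on-disk files are merged first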
-1 : 1; } }); // build final list of segments from merged backed by disk + in-mem List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>(); long inMemBytes = createInMemorySegments(finalSegments, 0); LOG.info("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce"); if (0 != onDiskBytes) { final int numInMemSegments = memDiskSegments.size(); diskSegments.addAll(0, memDiskSegments); memDiskSegments.clear(); RawKeyValueIterator diskMerge = Merger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, reporter, false, spilledRecordsCounter, null); diskSegments.clear(); if (0 == finalSegments.size()) { return diskMerge; } finalSegments.add(new Segment<K, V>(new RawKVIteratorReader(diskMerge, onDiskBytes), true)); } return Merger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, reporter, spilledRecordsCounter, null); } class RawKVIteratorReader extends IFile.Reader<K, V> { private final RawKeyValueIterator kvIter; public RawKVIteratorReader(RawKeyValueIterator kvIter, long size) throws IOException { super(null, null, size, null, spilledRecordsCounter); this.kvIter = kvIter; } public boolean next(DataInputBuffer key, DataInputBuffer value) throws IOException { if (kvIter.next()) { final DataInputBuffer kb = kvIter.getKey(); final DataInputBuffer vb = kvIter.getValue(); final int kp = kb.getPosition(); final int klen = kb.getLength() - kp; key.reset(kb.getData(), kp, klen); final int vp = vb.getPosition(); final int vlen = vb.getLength() - vp; value.reset(vb.getData(), vp, vlen); bytesRead += klen + vlen; return true; } return false; } public long getPosition() throws IOException { return bytesRead; } public void close() throws IOException { kvIter.close(); } } private CopyResult getCopyResult(int numInFlight) { synchronized (copyResults) { while (copyResults.isEmpty()) { try { // The idea is that if we have scheduled enough, we can // wait until // we hear from one of the copiers. if (busyEnough(numInFlight)) { copyResults.wait(); } else { return null; } } catch (InterruptedException e) { } } return copyResults.remove(0); } } private void addToMapOutputFilesOnDisk(FileStatus status) { synchronized (mapOutputFilesOnDisk) { mapOutputFilesOnDisk.add(status); mapOutputFilesOnDisk.notify(); } } /** * Starts merging the local copy (on disk) of the map's output so that * most of the reducer's input is sorted i.e overlapping shuffle and * merge phases. */ private class LocalFSMerger extends Thread { private LocalFileSystem localFileSys; public LocalFSMerger(LocalFileSystem fs) { this.localFileSys = fs; setName("Thread for merging on-disk files"); setDaemon(true); } @SuppressWarnings("unchecked") public void run() { try { LOG.info(reduceTask.getTaskID() + " Thread started: " + getName()); while (!exitLocalFSMerge) { synchronized (mapOutputFilesOnDisk) { while (!exitLocalFSMerge && mapOutputFilesOnDisk.size() < (2 * ioSortFactor - 1)) { LOG.info(reduceTask.getTaskID() + " Thread waiting: " + getName()); mapOutputFilesOnDisk.wait(); } } if (exitLocalFSMerge) {// to avoid running one extra // time in the end break; } List<Path> mapFiles = new ArrayList<Path>(); long approxOutputSize = 0; int bytesPerSum = reduceTask.getConf().getInt("io.bytes.per.checksum", 512); LOG.info(reduceTask.getTaskID() + "We have " + mapOutputFilesOnDisk.size() + " map outputs on disk. " + "Triggering merge of " + ioSortFactor + " files"); // 1. 
Prepare the list of files to be merged. This list // is prepared // using a list of map output files on disk. Currently // we merge // io.sort.factor files into 1. synchronized (mapOutputFilesOnDisk) { for (int i = 0; i < ioSortFactor; ++i) { FileStatus filestatus = mapOutputFilesOnDisk.first(); mapOutputFilesOnDisk.remove(filestatus); mapFiles.add(filestatus.getPath()); approxOutputSize += filestatus.getLen(); } } // sanity check if (mapFiles.size() == 0) { return; } // add the checksum length approxOutputSize += ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum); // 2. Start the on-disk merge process Path outputPath = lDirAlloc .getLocalPathForWrite(mapFiles.get(0).toString(), approxOutputSize, conf) .suffix(".merged"); Writer writer = new Writer(conf, rfs, outputPath, conf.getMapOutputKeyClass(), conf.getMapOutputValueClass(), codec, null); RawKeyValueIterator iter = null; Path tmpDir = new Path(reduceTask.getTaskID().toString()); try { iter = Merger.merge(conf, rfs, conf.getMapOutputKeyClass(), conf.getMapOutputValueClass(), codec, mapFiles.toArray(new Path[mapFiles.size()]), true, ioSortFactor, tmpDir, conf.getOutputKeyComparator(), reporter, spilledRecordsCounter, null); Merger.writeFile(iter, writer, reporter, conf); writer.close(); } catch (Exception e) { localFileSys.delete(outputPath, true); throw new IOException(StringUtils.stringifyException(e)); } synchronized (mapOutputFilesOnDisk) { addToMapOutputFilesOnDisk(localFileSys.getFileStatus(outputPath)); } LOG.info(reduceTask.getTaskID() + " Finished merging " + mapFiles.size() + " map output files on disk of total-size " + approxOutputSize + "." + " Local output file is " + outputPath + " of size " + localFileSys.getFileStatus(outputPath).getLen()); } } catch (Exception e) { LOG.warn(reduceTask.getTaskID() + " Merging of the local FS files threw an exception: " + StringUtils.stringifyException(e)); if (mergeThrowable == null) { mergeThrowable = e; } } catch (Throwable t) { String msg = getTaskID() + " : Failed to merge on the local FS" + StringUtils.stringifyException(t); reportFatalError(getTaskID(), t, msg); } } } private class InMemFSMergeThread extends Thread { public InMemFSMergeThread() { setName("Thread for merging in memory files"); setDaemon(true); } public void run() { LOG.info(reduceTask.getTaskID() + " Thread started: " + getName()); try { boolean exit = false; do { exit = ramManager.waitForDataToMerge(); if (!exit) { doInMemMerge(); } } while (!exit); } catch (Exception e) { LOG.warn(reduceTask.getTaskID() + " Merge of the inmemory files threw an exception: " + StringUtils.stringifyException(e)); ReduceCopier.this.mergeThrowable = e; } catch (Throwable t) { String msg = getTaskID() + " : Failed to merge in memory" + StringUtils.stringifyException(t); reportFatalError(getTaskID(), t, msg); } } @SuppressWarnings("unchecked") private void doInMemMerge() throws IOException { if (mapOutputsFilesInMemory.size() == 0) { return; } // name this output file same as the name of the first file that // is // there in the current list of inmem files (this is guaranteed // to // be absent on the disk currently. So we don't overwrite a // prev. // created spill). 
Also we need to create the output file now // since // it is not guaranteed that this file will be present after // merge // is called (we delete empty files as soon as we see them // in the merge method) // figure out the mapId TaskID mapId = mapOutputsFilesInMemory.get(0).mapId; List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>(); long mergeOutputSize = createInMemorySegments(inMemorySegments, 0); int noInMemorySegments = inMemorySegments.size(); Path outputPath = mapOutputFile.getInputFileForWrite(mapId, reduceTask.getTaskID(), mergeOutputSize, round); Writer writer = new Writer(conf, rfs, outputPath, conf.getMapOutputKeyClass(), conf.getMapOutputValueClass(), codec, null); RawKeyValueIterator rIter = null; try { LOG.info("Initiating in-memory merge with " + noInMemorySegments + " segments..."); rIter = Merger.merge(conf, rfs, (Class<K>) conf.getMapOutputKeyClass(), (Class<V>) conf.getMapOutputValueClass(), inMemorySegments, inMemorySegments.size(), new Path(reduceTask.getTaskID().toString()), conf.getOutputKeyComparator(), reporter, spilledRecordsCounter, null); if (combinerRunner == null) { Merger.writeFile(rIter, writer, reporter, conf); } else { combineCollector.setWriter(writer); combinerRunner.combine(rIter, combineCollector); } writer.close(); LOG.info(reduceTask.getTaskID() + " Merge of the " + noInMemorySegments + " files in-memory complete." + " Local file is " + outputPath + " of size " + localFileSys.getFileStatus(outputPath).getLen()); } catch (Exception e) { // make sure that we delete the ondisk file that we created // earlier when we invoked cloneFileAttributes localFileSys.delete(outputPath, true); throw (IOException) new IOException("Intermediate merge failed").initCause(e); } // Note the output of the merge FileStatus status = localFileSys.getFileStatus(outputPath); synchronized (mapOutputFilesOnDisk) { addToMapOutputFilesOnDisk(status); } } } private class GetMapEventsThread extends Thread { private IntWritable fromEventId = new IntWritable(0); private static final long SLEEP_TIME = 1000; public GetMapEventsThread() { setName("Thread for polling Map Completion Events"); setDaemon(true); } @Override public void run() { LOG.info(reduceTask.getTaskID() + " Thread started: " + getName()); do { try { // LOG.info("get events from " + fromEventId.get()); int numNewMaps = getMapCompletionEvents(); if (numNewMaps > 0) { LOG.info(reduceTask.getTaskID() + ": " + "Got " + numNewMaps + " new map-outputs"); } Thread.sleep(SLEEP_TIME); } catch (InterruptedException e) { LOG.warn(reduceTask.getTaskID() + " GetMapEventsThread returning after an " + " interrupted exception"); return; } catch (Throwable t) { String msg = reduceTask.getTaskID() + " GetMapEventsThread Ignoring exception : " + StringUtils.stringifyException(t); reportFatalError(getTaskID(), t, msg); } } while (!exitGetMapEvents); LOG.info("GetMapEventsThread exiting"); } private boolean called = false; /** * Queries the {@link TaskTracker} for a set of map-completion * events from a given event ID. 
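 * In this recovery task the events are not polled from the TaskTracker:
 * the locations are built exactly once from the mapSchedules list (note the
 * "called" flag), so every call after the first returns 0.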
 *
 * @throws IOException
 */
private int getMapCompletionEvents() throws IOException {
    int numNewMaps = 0;
    if (!called) {
        for (MapScheduleInfo msi : mapSchedules) {
            URI u = URI.create(msi.getHttpHost());
            String host = u.getHost();
            LOG.info("host " + host);
            TaskAttemptID taskId = msi.getTaskAttemptID();
            // TODO: maximum retries are not considered yet
            URL mapOutputLocation = new URL(msi.getHttpHost() + "/mapOutput?job="
                + taskId.getJobID() + "&map=" + taskId + "&reduce=" + getPartition()
                + "&iteration=" + round);
            LOG.info("recover copy address " + mapOutputLocation.toString());
            List<MapOutputLocation> loc = mapLocations.get(host);
            if (loc == null) {
                loc = Collections.synchronizedList(new LinkedList<MapOutputLocation>());
                mapLocations.put(host, loc);
            }
            loc.add(new MapOutputLocation(taskId, host, mapOutputLocation));
            numNewMaps++;
        }
        // this method should only be called once
        called = true;
    }
    return numNewMaps;
}
}
}

/**
 * Return the exponent of the power of two closest to the given positive
 * value; throws IllegalArgumentException if value <= 0. This follows the
 * observation that the msb of a given value is also the closest power of
 * two, unless the bit following it is set.
 */
private static int getClosestPowerOf2(int value) {
    if (value <= 0)
        throw new IllegalArgumentException("Undefined for " + value);
    final int hob = Integer.highestOneBit(value);
    return Integer.numberOfTrailingZeros(hob)
        + (((hob >>> 1) & value) == 0 ? 0 : 1);
}
}
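The retry budget that drives the penalty-box logic above comes straight from the backoff comment in the ReduceCopier constructor: with backoff(t) = init * 2^(t-1), the number of retries whose waits fit into mapred.reduce.copy.backoff seconds is roughly log2(max/init + 1), which the task approximates with getClosestPowerOf2. The standalone sketch below reproduces that arithmetic with the defaults quoted in the comments (a 300-second maximum and a 4-second initial wait inferred from the "4,8,16,32,64,128" sequence); the class name, the main() harness, and the minimum-retries constant are illustrative and not part of the Hadoop source.

// Standalone sketch of the retry-budget math used by the shuffle above.
// The 4000 ms initial backoff and 300 s maximum mirror the constructor
// comments; everything else here is an illustrative stand-in.
public class BackoffSketch {

    // Exponent of the power of two closest to value (same idea as getClosestPowerOf2).
    static int closestPowerOf2(int value) {
        if (value <= 0) {
            throw new IllegalArgumentException("Undefined for " + value);
        }
        int hob = Integer.highestOneBit(value);
        return Integer.numberOfTrailingZeros(hob) + (((hob >>> 1) & value) == 0 ? 0 : 1);
    }

    public static void main(String[] args) {
        final int backoffInitMs = 4000;    // first wait: 4 seconds (inferred default)
        final int maxBackoffSec = 300;     // mapred.reduce.copy.backoff default
        final int minRetriesPerMap = 2;    // illustrative lower bound standing in for
                                           // MIN_FETCH_RETRIES_PER_MAP, not shown above

        // backoff(1) + ... + backoff(n) = init * (2^n - 1) <= max
        // => n ~ log2(max/init + 1), computed via the closest power of two
        int maxFetchRetriesPerMap = Math.max(minRetriesPerMap,
                closestPowerOf2((maxBackoffSec * 1000 / backoffInitMs) + 1));
        System.out.println("retries per map = " + maxFetchRetriesPerMap); // prints 6

        // the waits those retries produce: 4s, 8s, 16s, 32s, 64s, 128s (sum = 252s)
        for (int attempt = 1; attempt <= maxFetchRetriesPerMap; attempt++) {
            System.out.println("attempt " + attempt + " waits "
                    + (backoffInitMs * (1 << (attempt - 1)) / 1000) + "s");
        }
    }
}

Running the sketch prints a budget of 6 retries, matching the "max_fetch_retries = 6" figure in the constructor comment, and the per-attempt waits sum to 252 seconds, which is why the comment describes the schedule as roughly 4.2 minutes for the 5-minute default.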