Source code

Java tutorial


Here is the source code for org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager.


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tez.runtime.library.common.shuffle.orderedgrouped;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ChecksumFileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Progressable;
import org.apache.tez.common.TezUtilsInternal;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.common.counters.TezCounter;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.runtime.api.InputContext;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;
import org.apache.tez.runtime.library.common.ConfigUtils;
import org.apache.tez.runtime.library.common.Constants;
import org.apache.tez.runtime.library.common.InputAttemptIdentifier;
import org.apache.tez.runtime.library.common.combine.Combiner;
import org.apache.tez.runtime.library.common.sort.impl.IFile;
import org.apache.tez.runtime.library.common.sort.impl.TezMerger;
import org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator;
import org.apache.tez.runtime.library.common.sort.impl.IFile.Writer;
import org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment;
import org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles;
import org.apache.tez.runtime.library.hadoop.compat.NullProgressable;

/**
 * Usage. Create instance. setInitialMemoryAvailable(long), configureAndStart()
 */
// Holds the fetched map outputs (in memory and on disk), the memory accounting for them,
// and the three merger objects that combine them.
// NOTE(review): CompressionCodec is used below but no import for it is visible in this
// listing — confirm the import block against the upstream file.
@SuppressWarnings(value = { "rawtypes" })
public class MergeManager {

    private static final Log LOG = LogFactory.getLog(MergeManager.class);

    private final Configuration conf;
    // File system handed to the constructor.
    private final FileSystem localFS;
    // Raw file system obtained from localFS in the constructor; used for merge reads/writes.
    private final FileSystem rfs;
    private final LocalDirAllocator localDirAllocator;

    private final TezTaskOutputFiles mapOutputFile;
    private final Progressable nullProgressable = new NullProgressable();
    // May be null; the in-memory merger checks for null before running it.
    private final Combiner combiner;

    // Outputs produced by memory-to-memory merges (added in closeInMemoryMergedFile).
    private final Set<MapOutput> inMemoryMergedMapOutputs = new TreeSet<MapOutput>(
            new MapOutput.MapOutputComparator());
    // Null when memory-to-memory merging is disabled in the configuration.
    private final IntermediateMemoryToMemoryMerger memToMemMerger;

    // Outputs fetched into memory (added in closeInMemoryFile).
    private final Set<MapOutput> inMemoryMapOutputs = new TreeSet<MapOutput>(new MapOutput.MapOutputComparator());
    private final InMemoryMerger inMemoryMerger;

    // On-disk outputs, ordered by FileChunk's natural ordering.
    final Set<FileChunk> onDiskMapOutputs = new TreeSet<FileChunk>();
    final OnDiskMerger onDiskMerger;

    // Upper bound on memory used for in-memory shuffle data.
    private final long memoryLimit;
    // Amount of in-memory data that finalMerge may leave in memory.
    final long postMergeMemLimit;
    // Bytes currently reserved; increased in unconditionalReserve, decreased in unreserve.
    private long usedMemory;
    // Bytes of fully fetched in-memory outputs; increased in closeInMemoryFile, decreased in unreserve.
    private long commitMemory;
    private final int ioSortFactor;
    // Requests of this size or larger are shuffled to disk instead of memory.
    private final long maxSingleShuffleLimit;

    // Number of in-memory outputs that triggers a memory-to-memory merge.
    private final int memToMemMergeOutputsThreshold;
    // commitMemory level at which a memory-to-disk merge is started.
    private final long mergeThreshold;

    private final long initialMemoryAvailable;

    private final ExceptionReporter exceptionReporter;

    private final InputContext inputContext;

    private final TezCounter spilledRecordsCounter;

    private final TezCounter reduceCombineInputCounter;

    private final TezCounter mergedMapOutputsCounter;

    private final TezCounter numMemToDiskMerges;
    private final TezCounter numDiskToDiskMerges;
    private final TezCounter additionalBytesWritten;
    private final TezCounter additionalBytesRead;

    private final CompressionCodec codec;

    // Set to true at the end of close(), once finalMerge has returned.
    private volatile boolean finalMergeComplete = false;

    private final boolean ifileReadAhead;
    // Forced to 0 by the constructor when read-ahead is disabled.
    private final int ifileReadAheadLength;
    private final int ifileBufferSize;

    // Supplies the numeric suffix appended to disk-to-disk merge output paths.
    private AtomicInteger mergeFileSequenceId = new AtomicInteger(0);

    /**
     * Construct the MergeManager. Must call start before it becomes usable.
     */
    // Reads the shuffle and merge settings from conf, derives the memory limits and merge
    // thresholds, and creates the three merger objects.
    // NOTE(review): this listing is visibly damaged inside the constructor — several calls are
    // unterminated, closing braces are absent, and bare string expressions appear where a
    // logging call presumably stood. The code is reproduced unchanged; confirm against the
    // upstream file before relying on it.
    public MergeManager(Configuration conf, FileSystem localFS, LocalDirAllocator localDirAllocator,
            InputContext inputContext, Combiner combiner, TezCounter spilledRecordsCounter,
            TezCounter reduceCombineInputCounter, TezCounter mergedMapOutputsCounter,
            ExceptionReporter exceptionReporter, long initialMemoryAvailable, CompressionCodec codec,
            boolean ifileReadAheadEnabled, int ifileReadAheadLength) {
        this.inputContext = inputContext;
        this.conf = conf;
        this.localDirAllocator = localDirAllocator;
        this.exceptionReporter = exceptionReporter;
        this.initialMemoryAvailable = initialMemoryAvailable;

        this.combiner = combiner;

        this.reduceCombineInputCounter = reduceCombineInputCounter;
        this.spilledRecordsCounter = spilledRecordsCounter;
        this.mergedMapOutputsCounter = mergedMapOutputsCounter;
        this.mapOutputFile = new TezTaskOutputFiles(conf, inputContext.getUniqueIdentifier());

        this.localFS = localFS;
        // The unchecked cast means a localFS that is not a LocalFileSystem fails here
        // with a ClassCastException.
        this.rfs = ((LocalFileSystem) localFS).getRaw();

        this.numDiskToDiskMerges = inputContext.getCounters().findCounter(TaskCounter.NUM_DISK_TO_DISK_MERGES);
        this.numMemToDiskMerges = inputContext.getCounters().findCounter(TaskCounter.NUM_MEM_TO_DISK_MERGES);
        this.additionalBytesWritten = inputContext.getCounters()
        this.additionalBytesRead = inputContext.getCounters().findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);

        this.codec = codec;
        this.ifileReadAhead = ifileReadAheadEnabled;
        // The read-ahead length only applies when read-ahead is enabled; otherwise it is 0.
        if (this.ifileReadAhead) {
            this.ifileReadAheadLength = ifileReadAheadLength;
        } else {
            this.ifileReadAheadLength = 0;
        this.ifileBufferSize = conf.getInt("io.file.buffer.size",

        // Figure out initial memory req start
        final float maxInMemCopyUse = conf.getFloat(
        // The fetch buffer fraction must lie in [0, 1].
        if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
            throw new IllegalArgumentException("Invalid value for "
                    + TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT + ": " + maxInMemCopyUse);

        // Allow unit tests to fix Runtime memory
        long memLimit = conf.getLong(Constants.TEZ_RUNTIME_TASK_MEMORY,
                (long) (inputContext.getTotalMemoryAvailableToTask() * maxInMemCopyUse));

        float maxRedPer = conf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT,
        // The post-merge buffer fraction must also lie in [0, 1].
        // NOTE(review): the exception message concatenates the key name and the value with no
        // separator or explanatory text.
        if (maxRedPer > 1.0 || maxRedPer < 0.0) {
            throw new TezUncheckedException(
                    TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT + maxRedPer);

        long maxRedBuffer = (long) (inputContext.getTotalMemoryAvailableToTask() * maxRedPer);
        // Figure out initial memory req end

        // Cap the shuffle memory limit at the memory actually made available.
        if (this.initialMemoryAvailable < memLimit) {
            this.memoryLimit = this.initialMemoryAvailable;
        } else {
            this.memoryLimit = memLimit;

        // Apply the same cap to the post-merge memory limit.
        if (this.initialMemoryAvailable < maxRedBuffer) {
            this.postMergeMemLimit = this.initialMemoryAvailable;
        } else {
            this.postMergeMemLimit = maxRedBuffer;
        }"InitialRequest: ShuffleMem=" + memLimit + ", postMergeMem=" + maxRedBuffer
                + ", RuntimeTotalAvailable=" + this.initialMemoryAvailable + ". Updated to: ShuffleMem="
                + this.memoryLimit + ", postMergeMem: " + this.postMergeMemLimit);

        this.ioSortFactor = conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,

        final float singleShuffleMemoryLimitPercent = conf.getFloat(
        // Unlike the two fractions above, this one must be strictly positive: (0, 1].
        if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) {
            throw new IllegalArgumentException(
                    "Invalid value for " + TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT + ": "
                            + singleShuffleMemoryLimitPercent);

        //TODO: Cap it to MAX_VALUE until MapOutput starts supporting > 2 GB
        this.maxSingleShuffleLimit = (long) Math.min((memoryLimit * singleShuffleMemoryLimitPercent),
        // The memory-to-memory segment threshold defaults to ioSortFactor.
        this.memToMemMergeOutputsThreshold = conf
                .getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, ioSortFactor);
        // The merge threshold is a configured fraction of memoryLimit.
        this.mergeThreshold = (long) (this.memoryLimit
                * conf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT,
                        TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT_DEFAULT));"MergerManager: memoryLimit=" + memoryLimit + ", " + "maxSingleShuffleLimit="
                + maxSingleShuffleLimit + ", " + "mergeThreshold=" + mergeThreshold + ", " + "ioSortFactor="
                + ioSortFactor + ", " + "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);

        // A single in-memory output must be smaller than the merge threshold.
        // NOTE(review): the message below misspells "Invalid" and runs "mergeThreshold" straight
        // into "maxSingleShuffleLimit: " without a separator.
        if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
            throw new RuntimeException("Invlaid configuration: "
                    + "maxSingleShuffleLimit should be less than mergeThreshold" + "maxSingleShuffleLimit: "
                    + this.maxSingleShuffleLimit + ", mergeThreshold: " + this.mergeThreshold);

        boolean allowMemToMemMerge = conf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM,
        // The memory-to-memory merger is optional; a null field means it is disabled.
        if (allowMemToMemMerge) {
            this.memToMemMerger = new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold);
        } else {
            this.memToMemMerger = null;

        this.inMemoryMerger = new InMemoryMerger(this);

        this.onDiskMerger = new OnDiskMerger(this);

    /**
     * NOTE(review): only the null check on the optional memory-to-memory merger is visible;
     * the rest of the body is cut off in this listing. Presumably this starts the merger
     * threads — confirm against the upstream file.
     */
    void configureAndStart() {
        if (this.memToMemMerger != null) {

    /**
     * Exposing this to get an initial memory ask without instantiating the object.
     */
    // Computes the larger of the shuffle memory request and the post-merge buffer request,
    // both derived from maxAvailableTaskMemory and fractions read from conf.
    // Throws IllegalArgumentException / TezUncheckedException when a fraction is outside [0, 1].
    // NOTE(review): some statements are cut off in this listing (unterminated conf.getFloat
    // calls, missing braces, bare string expressions where a log call presumably stood).
    static long getInitialMemoryRequirement(Configuration conf, long maxAvailableTaskMemory) {
        final float maxInMemCopyUse = conf.getFloat(
        if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
            throw new IllegalArgumentException("Invalid value for "
                    + TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT + ": " + maxInMemCopyUse);

        // Allow unit tests to fix Runtime memory
        long memLimit = conf.getLong(Constants.TEZ_RUNTIME_TASK_MEMORY,
                (long) (maxAvailableTaskMemory * maxInMemCopyUse));"Initial Shuffle Memory Required: " + memLimit + ", based on INPUT_BUFFER_factor: "
                + maxInMemCopyUse);

        float maxRedPer = conf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT,
        // NOTE(review): as in the constructor, this message has no separator between key and value.
        if (maxRedPer > 1.0 || maxRedPer < 0.0) {
            throw new TezUncheckedException(
                    TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT + maxRedPer);
        long maxRedBuffer = (long) (maxAvailableTaskMemory * maxRedPer);"Initial Memory required for final merged output: " + maxRedBuffer + ", using factor: "
                + maxRedPer);

        // The initial ask must cover whichever of the two needs is larger.
        long reqMem = Math.max(maxRedBuffer, memLimit);
        return reqMem;

    /**
     * NOTE(review): the body is missing from this listing. Judging by the name, it presumably
     * blocks until the in-memory merger has finished — confirm against the upstream file.
     *
     * @throws InterruptedException declared by the signature
     */
    public void waitForInMemoryMerge() throws InterruptedException {

    /**
     * Tells whether a map output of the given size may be fetched into memory.
     *
     * @param requestedSize size of the output to be fetched
     * @return true only when requestedSize is strictly below maxSingleShuffleLimit
     */
    private boolean canShuffleToMemory(long requestedSize) {
        return (requestedSize < maxSingleShuffleLimit);

    /**
     * Loops while usedMemory exceeds memoryLimit, then logs the elapsed time at debug level.
     * NOTE(review): the loop body is missing from this listing; presumably it waits on this
     * object's monitor — confirm against the upstream file.
     *
     * @throws InterruptedException declared by the signature
     */
    public synchronized void waitForShuffleToMergeMemory() throws InterruptedException {
        long startTime = System.currentTimeMillis();
        while (usedMemory > memoryLimit) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Waited for " + (System.currentTimeMillis() - startTime) + " for memory to become"
                    + " available");

    // Single shared "wait" output that reserve() returns when usedMemory exceeds memoryLimit.
    final private MapOutput stallShuffle = MapOutput.createWaitMapOutput(null);

    /**
     * Decides where a map output about to be fetched should go. Outputs that fail
     * canShuffleToMemory get a disk-backed MapOutput; when usedMemory is above memoryLimit
     * the shared stallShuffle instance is returned; otherwise memory is reserved through
     * unconditionalReserve.
     * NOTE(review): parts of the disk branch are cut off in this listing (the log call
     * prefix and the tail of the createDiskMapOutput call), and closing braces are absent.
     *
     * @param srcAttemptIdentifier identifies the source attempt being fetched
     * @param requestedSize size compared against maxSingleShuffleLimit and reserved in memory
     * @param compressedLength passed to the disk-backed output
     * @param fetcher passed to the disk-backed output
     * @return a disk output, the stall output, or a memory output
     * @throws IOException declared by the signature
     */
    public synchronized MapOutput reserve(InputAttemptIdentifier srcAttemptIdentifier, long requestedSize,
            long compressedLength, int fetcher) throws IOException {
        // canShuffleToMemory uses a strict '<', so a request equal to the limit also goes to disk.
        if (!canShuffleToMemory(requestedSize)) {
   + ": Shuffling to disk since " + requestedSize
                    + " is greater than maxSingleShuffleLimit (" + maxSingleShuffleLimit + ")");
            return MapOutput.createDiskMapOutput(srcAttemptIdentifier, this, compressedLength, conf, fetcher, true,

        // Stall shuffle if we are above the memory limit

        // It is possible that all threads could just be stalling and not make
        // progress at all. This could happen when:
        // requested size is causing the used memory to go above limit &&
        // requested size < singleShuffleLimit &&
        // current used size < mergeThreshold (merge will not get triggered)
        // To avoid this from happening, we allow exactly one thread to go past
        // the memory limit. We check (usedMemory > memoryLimit) and not
        // (usedMemory + requestedSize > memoryLimit). When this thread is done
        // fetching, this will automatically trigger a merge thereby unlocking
        // all the stalled threads

        if (usedMemory > memoryLimit) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(srcAttemptIdentifier + ": Stalling shuffle since usedMemory (" + usedMemory
                        + ") is greater than memoryLimit (" + memoryLimit + ")." + " CommitMemory is ("
                        + commitMemory + ")");
            return stallShuffle;

        // Allow the in-memory shuffle to progress
        if (LOG.isDebugEnabled()) {
            LOG.debug(srcAttemptIdentifier + ": Proceeding with shuffle since usedMemory (" + usedMemory
                    + ") is lesser than memoryLimit (" + memoryLimit + ")." + "CommitMemory is (" + commitMemory
                    + ")");
        // 'true' marks the result as a primary map output.
        return unconditionalReserve(srcAttemptIdentifier, requestedSize, true);

    /**
     * Unconditional Reserve is used by the Memory-to-Memory thread
     */
    // Adds requestedSize to usedMemory without checking memoryLimit and returns a new
    // in-memory MapOutput of that size. Also called from reserve() once its checks pass.
    // NOTE(review): requestedSize is narrowed to int for the buffer size; a value above
    // Integer.MAX_VALUE would be truncated — confirm that callers cap it.
    private synchronized MapOutput unconditionalReserve(InputAttemptIdentifier srcAttemptIdentifier,
            long requestedSize, boolean primaryMapOutput) throws IOException {
        usedMemory += requestedSize;
        return MapOutput.createMemoryMapOutput(srcAttemptIdentifier, this, (int) requestedSize, primaryMapOutput);

    /**
     * Releases previously reserved memory by subtracting size from both commitMemory and
     * usedMemory, then logs the new values at debug level.
     * NOTE(review): the debug text says "Notifying", but no notify call is visible in this
     * listing — a statement may have been lost; confirm against the upstream file.
     *
     * @param size number of bytes to release
     */
    synchronized void unreserve(long size) {
        commitMemory -= size;
        usedMemory -= size;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Notifying unreserve : commitMemory=" + commitMemory + ", usedMemory=" + usedMemory
                    + ", mergeThreshold=" + mergeThreshold);

    /**
     * Records a completed in-memory map output: adds it to inMemoryMapOutputs and adds its
     * size to commitMemory, then checks the memory-to-disk and memory-to-memory triggers.
     * NOTE(review): the bodies of both trigger branches are cut off in this listing, as is
     * the log call prefix before the first string expression.
     *
     * @param mapOutput the in-memory output that finished fetching
     */
    public synchronized void closeInMemoryFile(MapOutput mapOutput) {
        inMemoryMapOutputs.add(mapOutput);"closeInMemoryFile -> map-output of size: " + mapOutput.getSize()
                + ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size() + ", commitMemory -> "
                + commitMemory + ", usedMemory ->" + usedMemory);

        commitMemory += mapOutput.getSize();

        // Reaching the merge threshold is the memory-to-disk trigger.
        if (commitMemory >= mergeThreshold) {

        // This should likely run a Combiner.
        if (memToMemMerger != null) {
            synchronized (memToMemMerger) {
                // Memory-to-memory trigger: merger idle and enough outputs accumulated.
                if (!memToMemMerger.isInProgress() && inMemoryMapOutputs.size() >= memToMemMergeOutputsThreshold) {

    /**
     * Under the inMemoryMerger lock, acts only when no in-memory merge is in progress.
     * NOTE(review): everything inside the inner branch except a log message fragment is
     * missing from this listing; presumably it hands the in-memory outputs to the merger —
     * confirm against the upstream file.
     */
    private void startMemToDiskMerge() {
        synchronized (inMemoryMerger) {
            if (!inMemoryMerger.isInProgress()) {
      "Starting inMemoryMerger's merge since commitMemory=" + commitMemory + " > mergeThreshold="
                        + mergeThreshold + ". Current usedMemory=" + usedMemory);

    /**
     * Records the result of a memory-to-memory merge by adding it to inMemoryMergedMapOutputs.
     * NOTE(review): the log call prefix before the string expression is missing in this listing.
     *
     * @param mapOutput the merged in-memory output
     */
    public synchronized void closeInMemoryMergedFile(MapOutput mapOutput) {
        inMemoryMergedMapOutputs.add(mapOutput);"closeInMemoryMergedFile -> size: " + mapOutput.getSize() + ", inMemoryMergedMapOutputs.size() -> "
                + inMemoryMergedMapOutputs.size());

    /**
     * Called with a finished on-disk output; under the onDiskMerger lock it checks whether a
     * disk-to-disk merge should begin.
     * NOTE(review): no statement adding 'file' to onDiskMapOutputs is visible, and the body
     * of the trigger branch is cut off — lines appear to be missing from this listing.
     *
     * @param file the on-disk output that was just completed
     */
    public synchronized void closeOnDiskFile(FileChunk file) {

        synchronized (onDiskMerger) {
            // Trigger: merger idle and at least (2 * ioSortFactor - 1) files on disk.
            if (!onDiskMerger.isInProgress() && onDiskMapOutputs.size() >= (2 * ioSortFactor - 1)) {

    /**
     * Should <b>only</b> be used after the Shuffle phase is complete, otherwise can
     * return an invalid state since a merge may not be in progress due to
     * inadequate inputs
     *
     * @return true if the merge process is complete, otherwise false
     */
    // Returns the volatile finalMergeComplete flag, which close() sets to true after
    // finalMerge has returned.
    public boolean isMergeComplete() {
        return finalMergeComplete;

    /**
     * Snapshots the in-memory and on-disk outputs, runs finalMerge over them and marks the
     * final merge as complete.
     * NOTE(review): the statements that wait for on-going merges are cut off in this listing.
     * Also, only inMemoryMergedMapOutputs is copied into 'memory' in the visible code;
     * inMemoryMapOutputs is not — confirm a line was not lost.
     *
     * @return iterator over the merged key/value data produced by finalMerge
     * @throws Throwable declared by the signature
     */
    public TezRawKeyValueIterator close() throws Throwable {
        // Wait for on-going merges to complete
        if (memToMemMerger != null) {

        // Copy into lists so finalMerge can remove entries by index and append results.
        List<MapOutput> memory = new ArrayList<MapOutput>(inMemoryMergedMapOutputs);
        List<FileChunk> disk = new ArrayList<FileChunk>(onDiskMapOutputs);
        TezRawKeyValueIterator kvIter = finalMerge(conf, rfs, memory, disk);
        this.finalMergeComplete = true;
        return kvIter;

    /**
     * Runs the configured combiner over kvIter, writing its output to writer.
     * The combiner field is dereferenced without a null check here; the visible caller in
     * InMemoryMerger only reaches this method when the combiner is non-null.
     *
     * @param kvIter source of key/value pairs
     * @param writer destination for the combined output
     * @throws IOException declared by the signature
     * @throws InterruptedException declared by the signature
     */
    void runCombineProcessor(TezRawKeyValueIterator kvIter, Writer writer)
            throws IOException, InterruptedException {
        combiner.combine(kvIter, writer);

    /**
     * Merges multiple in-memory segments to another in-memory segment
     */
    // Merge thread whose merge() combines in-memory map outputs into one newly reserved
    // in-memory output.
    // NOTE(review): this listing is damaged — the early-return body, the tail of the
    // writeFile call, log call prefixes, the statement after "Note the output of the merge"
    // and all closing braces are missing. Code is reproduced unchanged.
    private class IntermediateMemoryToMemoryMerger extends MergeThread<MapOutput> {

        // The thread name includes the cleaned source vertex name.
        public IntermediateMemoryToMemoryMerger(MergeManager manager, int mergeFactor) {
            super(manager, mergeFactor, exceptionReporter);
            setName("MemToMemMerger [" + TezUtilsInternal.cleanVertexName(inputContext.getSourceVertexName())
                    + "]");

        public void merge(List<MapOutput> inputs) throws IOException {
            // Guard against a null or empty input list (guard body not visible).
            if (inputs == null || inputs.size() == 0) {

            // The merged output is registered under the first input's attempt identifier.
            InputAttemptIdentifier dummyMapId = inputs.get(0).getAttemptIdentifier();
            List<Segment> inMemorySegments = new ArrayList<Segment>();
            // leaveBytes = 0: every input is turned into a segment and removed from 'inputs'.
            long mergeOutputSize = createInMemorySegments(inputs, inMemorySegments, 0);
            int noInMemorySegments = inMemorySegments.size();

            // Reserve space for the result without checking memoryLimit; 'false' marks it as
            // a non-primary map output.
            MapOutput mergedMapOutputs = unconditionalReserve(dummyMapId, mergeOutputSize, false);

            Writer writer = new InMemoryWriter(mergedMapOutputs.getArrayStream());

  "Initiating Memory-to-Memory merge with " + noInMemorySegments + " segments of total-size: "
                    + mergeOutputSize);

            // Nothing will be materialized to disk because the sort factor is being
            // set to the number of in memory segments.
            // TODO Is this doing any combination ?
            // NOTE(review): unlike the other TezMerger.merge calls in this file, no key class
            // argument is visible before the value class here — a line may be missing.
            TezRawKeyValueIterator rIter = TezMerger.merge(conf, rfs,
                    ConfigUtils.getIntermediateInputValueClass(conf), inMemorySegments, inMemorySegments.size(),
                    new Path(inputContext.getUniqueIdentifier()),
                    (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(conf), nullProgressable, null,
                    null, null, null);
            TezMerger.writeFile(rIter, writer, nullProgressable,

   + " Memory-to-Memory merge of the " + noInMemorySegments
                    + " files in-memory complete.");

            // Note the output of the merge

    /**
     * Merges multiple in-memory segments to a disk segment
     */
    // Merge thread whose merge() writes in-memory map outputs to a single file on disk,
    // running the combiner when one is configured.
    // NOTE(review): this listing is damaged — the early-return body, the tail of the
    // outputPath expression and of the writeFile call, the writer close statements, log call
    // prefixes and all closing braces are missing. Code is reproduced unchanged.
    private class InMemoryMerger extends MergeThread<MapOutput> {

        // Integer.MAX_VALUE is passed as the merge factor; the thread name includes the
        // cleaned source vertex name.
        public InMemoryMerger(MergeManager manager) {
            super(manager, Integer.MAX_VALUE, exceptionReporter);
            setName("MemtoDiskMerger [" + TezUtilsInternal.cleanVertexName(inputContext.getSourceVertexName())
                    + "]");

        public void merge(List<MapOutput> inputs) throws IOException, InterruptedException {
            // Guard against a null or empty input list (guard body not visible).
            if (inputs == null || inputs.size() == 0) {


            //name this output file same as the name of the first file that is 
            //there in the current list of inmem files (this is guaranteed to
            //be absent on the disk currently. So we don't overwrite a prev. 
            //created spill). Also we need to create the output file now since
            //it is not guaranteed that this file will be present after merge
            //is called (we delete empty files as soon as we see them
            //in the merge method)

            //figure out the mapId 
            InputAttemptIdentifier srcTaskIdentifier = inputs.get(0).getAttemptIdentifier();

            List<Segment> inMemorySegments = new ArrayList<Segment>();
            // leaveBytes = 0: every input is turned into a segment and removed from 'inputs'.
            long mergeOutputSize = createInMemorySegments(inputs, inMemorySegments, 0);
            int noInMemorySegments = inMemorySegments.size();

            // TODO Maybe track serialized vs deserialized bytes.

            // All disk writes done by this merge are overhead - due to the lack of
            // adequate memory to keep all segments in memory.
            Path outputPath = mapOutputFile
                    .getInputFileForWrite(srcTaskIdentifier.getInputIdentifier().getInputIndex(), mergeOutputSize)

            Writer writer = null;
            long outFileLen = 0;
            try {
                writer = new Writer(conf, rfs, outputPath, (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                        (Class) ConfigUtils.getIntermediateInputValueClass(conf), codec, null, null);

                TezRawKeyValueIterator rIter = null;
      "Initiating in-memory merge with " + noInMemorySegments + " segments...");

                // Nothing actually materialized to disk - controlled by setting sort-factor to #segments.
                rIter = TezMerger.merge(conf, rfs, (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                        (Class) ConfigUtils.getIntermediateInputValueClass(conf), inMemorySegments,
                        inMemorySegments.size(), new Path(inputContext.getUniqueIdentifier()),
                        (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(conf), nullProgressable,
                        spilledRecordsCounter, null, additionalBytesRead, null);
                // spilledRecordsCounter is tracking the number of keys that will be
                // read from each of the segments being merged - which is essentially
                // what will be written to disk.

                // Without a combiner the merged stream is written directly; otherwise it is
                // passed through the combiner.
                if (null == combiner) {
                    TezMerger.writeFile(rIter, writer, nullProgressable,
                } else {
                    // TODO Counters for Combine
                    runCombineProcessor(rIter, writer);
                // Clearing the reference makes the finally block's null check skip it.
                writer = null;

                outFileLen = localFS.getFileStatus(outputPath).getLen();
       + " Merge of the " + noInMemorySegments
                        + " files in-memory complete." + " Local file is " + outputPath + " of size " + outFileLen);
            } catch (IOException e) {
                //make sure that we delete the ondisk file that we created 
                //earlier when we invoked cloneFileAttributes
                localFS.delete(outputPath, true);
                throw e;
            } finally {
                if (writer != null) {

            // Note the output of the merge
            closeOnDiskFile(new FileChunk(outputPath, 0, outFileLen));


    /**
     * Merges multiple on-disk segments
     */
    // Merge thread whose merge() combines several on-disk files into one new on-disk file.
    // NOTE(review): this listing is damaged — the early-return body, the statement adding
    // each segment to inputSegments, the rest of the namePart expression, the tail of the
    // writeFile call, log call prefixes and all closing braces are missing. FilenameUtils is
    // used without a visible import. Code is reproduced unchanged.
    class OnDiskMerger extends MergeThread<FileChunk> {

        // ioSortFactor is passed as the merge factor; the thread name includes the cleaned
        // source vertex name.
        public OnDiskMerger(MergeManager manager) {
            super(manager, ioSortFactor, exceptionReporter);
            setName("DiskToDiskMerger [" + TezUtilsInternal.cleanVertexName(inputContext.getSourceVertexName())
                    + "]");

        public void merge(List<FileChunk> inputs) throws IOException {
            // sanity check
            if (inputs == null || inputs.isEmpty()) {
      "No ondisk files to merge...");

            long approxOutputSize = 0;
            int bytesPerSum = conf.getInt("io.bytes.per.checksum", 512);

  "OnDiskMerger: We have  " + inputs.size() + " map outputs on disk. Triggering merge...");

            List<Segment> inputSegments = new ArrayList<Segment>(inputs.size());

            // 1. Prepare the list of files to be merged.
            for (FileChunk fileChunk : inputs) {
                final long offset = fileChunk.getOffset();
                final long size = fileChunk.getLength();
                // The chunk's isLocalFile() flag is passed to the Segment as 'preserve'.
                final boolean preserve = fileChunk.isLocalFile();
                final Path file = fileChunk.getPath();
                approxOutputSize += size;
                Segment segment = new Segment(rfs, file, offset, size, codec, ifileReadAhead, ifileReadAheadLength,
                        ifileBufferSize, preserve);

            // add the checksum length
            approxOutputSize += ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

            // 2. Start the on-disk merge process
            FileChunk file0 = inputs.get(0);
            String namePart;
            if (file0.isLocalFile()) {
                // This is setup the same way a type DISK MapOutput is setup when fetching.
                namePart = mapOutputFile

            } else {
                namePart = file0.getPath().getName().toString();

            // namePart includes the suffix of the file. We need to remove it.
            namePart = FilenameUtils.removeExtension(namePart);
            Path outputPath = localDirAllocator.getLocalPathForWrite(namePart, approxOutputSize, conf);
            // The sequence id appended after the prefix keeps successive merge outputs distinct.
            outputPath = outputPath.suffix(Constants.MERGED_OUTPUT_PREFIX + mergeFileSequenceId.getAndIncrement());

            Writer writer = new Writer(conf, rfs, outputPath,
                    (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                    (Class) ConfigUtils.getIntermediateInputValueClass(conf), codec, null, null);
            Path tmpDir = new Path(inputContext.getUniqueIdentifier());
            try {
                TezRawKeyValueIterator iter = TezMerger.merge(conf, rfs,
                        (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                        (Class) ConfigUtils.getIntermediateInputValueClass(conf), inputSegments, ioSortFactor,
                        tmpDir, (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(conf),
                        nullProgressable, true, spilledRecordsCounter, null, mergedMapOutputsCounter, null);

                // TODO Maybe differentiate between data written because of Merges and
                // the finalMerge (i.e. final mem available may be different from
                // initial merge mem)
                TezMerger.writeFile(iter, writer, nullProgressable,
            } catch (IOException e) {
                // Remove the partially written output before propagating the failure.
                localFS.delete(outputPath, true);
                throw e;

            final long outputLen = localFS.getFileStatus(outputPath).getLen();
            // Register the merged file as a new on-disk output.
            closeOnDiskFile(new FileChunk(outputPath, 0, outputLen));

   + " Finished merging " + inputs.size()
                    + " map output files on disk of total-size " + approxOutputSize + "." + " Local output file is "
                    + outputPath + " of size " + outputLen);

    /**
     * Removes map outputs from the front of inMemoryMapOutputs and wraps each in a Segment
     * backed by an in-memory reader, until the bytes still held by the list are no more than
     * leaveBytes. The input list is modified in place.
     * NOTE(review): this listing is damaged — the receiver of the ".add(" call (presumably
     * inMemorySegments) and the closing braces of both loops are missing.
     *
     * @param inMemoryMapOutputs outputs to consume; entries are removed from index 0
     * @param inMemorySegments destination list for the created segments
     * @param leaveBytes number of bytes allowed to remain in inMemoryMapOutputs
     * @return total byte size of the outputs that were converted to segments
     * @throws IOException declared by the signature
     */
    private long createInMemorySegments(List<MapOutput> inMemoryMapOutputs, List<Segment> inMemorySegments,
            long leaveBytes) throws IOException {
        long totalSize = 0L;
        // We could use fullSize could come from the RamManager, but files can be
        // closed but not yet present in inMemoryMapOutputs
        long fullSize = 0L;
        for (MapOutput mo : inMemoryMapOutputs) {
            fullSize += mo.getMemory().length;
        while (fullSize > leaveBytes) {
            MapOutput mo = inMemoryMapOutputs.remove(0);
            byte[] data = mo.getMemory();
            long size = data.length;
            totalSize += size;
            fullSize -= size;
            IFile.Reader reader = new InMemoryReader(MergeManager.this, mo.getAttemptIdentifier(), data, 0,
                    (int) size);
                    // Only primary map outputs are given mergedMapOutputsCounter; others get null.
                    .add(new Segment(reader, true, (mo.isPrimaryMapOutput() ? mergedMapOutputsCounter : null)));
        return totalSize;

    // IFile.Reader that serves keys and values from a TezRawKeyValueIterator instead of a
    // file, counting the bytes it hands out in bytesRead.
    // NOTE(review): this listing is damaged — the tail of the super(...) call, the condition
    // inside "if (" in readRawKey, the body of close() and all closing braces are missing.
    // DataInputBuffer is used without a visible import. Code is reproduced unchanged.
    class RawKVIteratorReader extends IFile.Reader {

        private final TezRawKeyValueIterator kvIter;

        public RawKVIteratorReader(TezRawKeyValueIterator kvIter, long size) throws IOException {
            super(null, size, null, spilledRecordsCounter, null, ifileReadAhead, ifileReadAheadLength,
            this.kvIter = kvIter;

        // Points 'key' at the iterator's current key bytes and returns NEW_KEY; returns
        // NO_KEY when the (not visible) condition fails.
        public KeyState readRawKey(DataInputBuffer key) throws IOException {
            if ( {
                final DataInputBuffer kb = kvIter.getKey();
                final int kp = kb.getPosition();
                // Remaining bytes from the buffer's current position.
                final int klen = kb.getLength() - kp;
                key.reset(kb.getData(), kp, klen);
                bytesRead += klen;
                return KeyState.NEW_KEY;
            return KeyState.NO_KEY;

        // Points 'value' at the iterator's current value bytes.
        public void nextRawValue(DataInputBuffer value) throws IOException {
            final DataInputBuffer vb = kvIter.getValue();
            final int vp = vb.getPosition();
            final int vlen = vb.getLength() - vp;
            value.reset(vb.getData(), vp, vlen);
            bytesRead += vlen;

        // Position is reported as the number of key and value bytes handed out so far.
        public long getPosition() throws IOException {
            return bytesRead;

        public void close() throws IOException {

    /**
     * Builds the final iterator over all remaining map outputs. In-memory outputs beyond
     * postMergeMemLimit are turned into segments and, when fewer than ioSortFactor files are
     * on disk, written to a new disk file. On-disk files are then wrapped as segments sorted
     * by length and merged, and the result is combined with the segments kept in memory.
     * Both list arguments are modified.
     * NOTE(review): this listing is damaged — log call prefixes, the tail of the outputPath
     * expression and of the writeFile call, the writer close statement and many closing
     * braces are missing. Code is reproduced unchanged.
     *
     * @param job configuration used for key/value classes and the comparator
     * @param fs file system used to read and write merge files
     * @param inMemoryMapOutputs in-memory outputs; entries are removed as they become segments
     * @param onDiskMapOutputs on-disk outputs; a spilled in-memory merge result may be appended
     * @return iterator over the merged key/value data
     * @throws IOException if a merge or write fails
     */
    private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs, List<MapOutput> inMemoryMapOutputs,
            List<FileChunk> onDiskMapOutputs) throws IOException {"finalMerge called with " + inMemoryMapOutputs.size() + " in-memory map-outputs and "
                + onDiskMapOutputs.size() + " on-disk map-outputs");

        // merge config params
        Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
        Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
        final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
        final RawComparator comparator = (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);

        // segments required to vacate memory
        List<Segment> memDiskSegments = new ArrayList<Segment>();
        long inMemToDiskBytes = 0;
        // NOTE(review): assigned below but never read in the visible code.
        boolean mergePhaseFinished = false;
        if (inMemoryMapOutputs.size() > 0) {
            int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier().getInputIndex();
            // Pull outputs out of memory until at most postMergeMemLimit bytes remain.
            inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, this.postMergeMemLimit);
            final int numMemDiskSegments = memDiskSegments.size();
            if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {

                // If we reach here, it implies that we have less than io.sort.factor
                // disk segments and this will be incremented by 1 (result of the 
                // memory segments merge). Since this total would still be 
                // <= io.sort.factor, we will not do any more intermediate merges,
                // the merge of all these disk segments would be directly fed to the
                // reduce method

                mergePhaseFinished = true;
                // must spill to disk, but can't retain in-mem for intermediate merge
                final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, inMemToDiskBytes)
                final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass, memDiskSegments,
                        numMemDiskSegments, tmpDir, comparator, nullProgressable, spilledRecordsCounter, null,
                        additionalBytesRead, null);
                final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null, null);
                try {
                    TezMerger.writeFile(rIter, writer, nullProgressable,
                } catch (IOException e) {
                    if (null != outputPath) {
                        try {
                            fs.delete(outputPath, true);
                        } catch (IOException ie) {
                            // NOTHING
                            // NOTE(review): a cleanup failure is dropped silently so that the
                            // original exception 'e' is the one rethrown below.
                    throw e;
                } finally {
                    if (null != writer) {

                final FileStatus fStatus = localFS.getFileStatus(outputPath);
                // add to list of final disk outputs.
                onDiskMapOutputs.add(new FileChunk(outputPath, 0, fStatus.getLen()));

      "Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes
                        + " bytes to disk to satisfy " + "reduce memory limit");
                // The spilled bytes are now counted through the new on-disk file instead.
                inMemToDiskBytes = 0;
            } else if (inMemToDiskBytes != 0) {
      "Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes
                        + " bytes in memory for " + "intermediate, on-disk merge");

        // segments on disk
        List<Segment> diskSegments = new ArrayList<Segment>();
        long onDiskBytes = inMemToDiskBytes;
        FileChunk[] onDisk = onDiskMapOutputs.toArray(new FileChunk[onDiskMapOutputs.size()]);
        for (FileChunk fileChunk : onDisk) {
            final long fileLength = fileChunk.getLength();
            onDiskBytes += fileLength;
            LOG.debug("Disk file: " + fileChunk.getPath() + " Length is " + fileLength);

            final Path file = fileChunk.getPath();
            // Files recognised as earlier merge outputs get no mergedMapOutputsCounter.
            // NOTE(review): OnDiskMerger appends MERGED_OUTPUT_PREFIX followed by a sequence
            // number, so endsWith(MERGED_OUTPUT_PREFIX) may not match those paths — confirm.
            TezCounter counter = file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX) ? null
                    : mergedMapOutputsCounter;

            final long fileOffset = fileChunk.getOffset();
            final boolean preserve = fileChunk.isLocalFile();
            diskSegments.add(new Segment(fs, file, fileOffset, fileLength, codec, ifileReadAhead,
                    ifileReadAheadLength, ifileBufferSize, preserve, counter));
        }"Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
        // Order disk segments by ascending length.
        Collections.sort(diskSegments, new Comparator<Segment>() {
            public int compare(Segment o1, Segment o2) {
                if (o1.getLength() == o2.getLength()) {
                    return 0;
                return o1.getLength() < o2.getLength() ? -1 : 1;

        // build final list of segments from merged backed by disk + in-mem
        List<Segment> finalSegments = new ArrayList<Segment>();
        // leaveBytes = 0: all outputs still in memory become segments of the final merge.
        long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);"Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
        if (0 != onDiskBytes) {
            final int numInMemSegments = memDiskSegments.size();
            // In-memory segments that were not spilled go first; numInMemSegments is passed
            // to the merger along with the combined list.
            diskSegments.addAll(0, memDiskSegments);
            TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, codec, diskSegments,
                    ioSortFactor, numInMemSegments, tmpDir, comparator, nullProgressable, false,
                    spilledRecordsCounter, null, additionalBytesRead, null);
            // With nothing left in memory, the disk merge is the final result.
            if (0 == finalSegments.size()) {
                return diskMerge;
            // Otherwise expose the disk merge as one more segment of the final merge.
            finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), true));
        // This is doing nothing but creating an iterator over the segments.
        return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir,
                comparator, nullProgressable, spilledRecordsCounter, null, additionalBytesRead, null);