org.apache.tez.runtime.library.common.shuffle.impl.MergeManager.java Source code

Introduction

Here is the source code for org.apache.tez.runtime.library.common.shuffle.impl.MergeManager.java. MergeManager implements the reduce side of the Tez shuffle: it reserves memory for fetched map outputs against a configured budget, runs background merge threads (memory-to-memory, in-memory-to-disk, and on-disk), and, on close(), performs a final merge that yields a single sorted TezRawKeyValueIterator.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tez.runtime.library.common.shuffle.impl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ChecksumFileSystem;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.common.TezJobConfig;
import org.apache.tez.common.counters.TezCounter;
import org.apache.tez.runtime.api.TezInputContext;
import org.apache.tez.runtime.library.common.ConfigUtils;
import org.apache.tez.runtime.library.common.Constants;
import org.apache.tez.runtime.library.common.InputAttemptIdentifier;
import org.apache.tez.runtime.library.common.combine.Combiner;
import org.apache.tez.runtime.library.common.sort.impl.IFile;
import org.apache.tez.runtime.library.common.sort.impl.TezMerger;
import org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator;
import org.apache.tez.runtime.library.common.sort.impl.IFile.Writer;
import org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment;
import org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles;
import org.apache.tez.runtime.library.hadoop.compat.NullProgressable;

@InterfaceAudience.Private
@InterfaceStability.Unstable
@SuppressWarnings(value = { "rawtypes" })
public class MergeManager {

    private static final Log LOG = LogFactory.getLog(MergeManager.class);

    private final Configuration conf;
    private final FileSystem localFS;
    private final FileSystem rfs;
    private final LocalDirAllocator localDirAllocator;

    private final TezTaskOutputFiles mapOutputFile;
    private final Progressable nullProgressable = new NullProgressable();
    private final Combiner combiner;

    Set<MapOutput> inMemoryMergedMapOutputs = new TreeSet<MapOutput>(new MapOutput.MapOutputComparator());
    private final IntermediateMemoryToMemoryMerger memToMemMerger;

    Set<MapOutput> inMemoryMapOutputs = new TreeSet<MapOutput>(new MapOutput.MapOutputComparator());
    private final InMemoryMerger inMemoryMerger;

    Set<Path> onDiskMapOutputs = new TreeSet<Path>();
    private final OnDiskMerger onDiskMerger;

    private final long memoryLimit;
    private long usedMemory;
    private long commitMemory;
    private final long maxSingleShuffleLimit;

    private final int memToMemMergeOutputsThreshold;
    private final long mergeThreshold;

    private final int ioSortFactor;

    private final ExceptionReporter exceptionReporter;

    private final TezInputContext inputContext;

    private final TezCounter spilledRecordsCounter;

    private final TezCounter reduceCombineInputCounter;

    private final TezCounter mergedMapOutputsCounter;

    private final CompressionCodec codec;

    private volatile boolean finalMergeComplete = false;

    private final boolean ifileReadAhead;
    private final int ifileReadAheadLength;
    private final int ifileBufferSize;

    public MergeManager(Configuration conf, FileSystem localFS, LocalDirAllocator localDirAllocator,
            TezInputContext inputContext, Combiner combiner, TezCounter spilledRecordsCounter,
            TezCounter reduceCombineInputCounter, TezCounter mergedMapOutputsCounter,
            ExceptionReporter exceptionReporter) {
        this.inputContext = inputContext;
        this.conf = conf;
        this.localDirAllocator = localDirAllocator;
        this.exceptionReporter = exceptionReporter;

        this.combiner = combiner;

        this.reduceCombineInputCounter = reduceCombineInputCounter;
        this.spilledRecordsCounter = spilledRecordsCounter;
        this.mergedMapOutputsCounter = mergedMapOutputsCounter;
        this.mapOutputFile = new TezTaskOutputFiles(conf, inputContext.getUniqueIdentifier());

        this.localFS = localFS;
        this.rfs = ((LocalFileSystem) localFS).getRaw();

        if (ConfigUtils.isIntermediateInputCompressed(conf)) {
            Class<? extends CompressionCodec> codecClass = ConfigUtils.getIntermediateInputCompressorClass(conf,
                    DefaultCodec.class);
            codec = ReflectionUtils.newInstance(codecClass, conf);
        } else {
            codec = null;
        }
        this.ifileReadAhead = conf.getBoolean(TezJobConfig.TEZ_RUNTIME_IFILE_READAHEAD,
                TezJobConfig.TEZ_RUNTIME_IFILE_READAHEAD_DEFAULT);
        if (this.ifileReadAhead) {
            this.ifileReadAheadLength = conf.getInt(TezJobConfig.TEZ_RUNTIME_IFILE_READAHEAD_BYTES,
                    TezJobConfig.TEZ_RUNTIME_IFILE_READAHEAD_BYTES_DEFAULT);
        } else {
            this.ifileReadAheadLength = 0;
        }
        this.ifileBufferSize = conf.getInt("io.file.buffer.size",
                TezJobConfig.TEZ_RUNTIME_IFILE_BUFFER_SIZE_DEFAULT);

        final float maxInMemCopyUse = conf.getFloat(TezJobConfig.TEZ_RUNTIME_SHUFFLE_INPUT_BUFFER_PERCENT,
                TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_INPUT_BUFFER_PERCENT);
        if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
            throw new IllegalArgumentException("Invalid value for "
                    + TezJobConfig.TEZ_RUNTIME_SHUFFLE_INPUT_BUFFER_PERCENT + ": " + maxInMemCopyUse);
        }

        // Allow unit tests to fix Runtime memory
        this.memoryLimit = (long) (conf.getLong(Constants.TEZ_RUNTIME_TASK_MEMORY,
                Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE)) * maxInMemCopyUse);
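        // Illustrative arithmetic (assumed numbers, not defaults from this
        // file): with a 1 GiB max heap and maxInMemCopyUse = 0.70, memoryLimit
        // comes to about 0.7 GiB, unless TEZ_RUNTIME_TASK_MEMORY overrides the
        // heap-derived term (as unit tests do).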

        this.ioSortFactor = conf.getInt(TezJobConfig.TEZ_RUNTIME_IO_SORT_FACTOR,
                TezJobConfig.DEFAULT_TEZ_RUNTIME_IO_SORT_FACTOR);

        final float singleShuffleMemoryLimitPercent = conf.getFloat(
                TezJobConfig.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT,
                TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT);
        if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) {
            throw new IllegalArgumentException(
                    "Invalid value for " + TezJobConfig.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT + ": "
                            + singleShuffleMemoryLimitPercent);
        }

        this.maxSingleShuffleLimit = (long) (memoryLimit * singleShuffleMemoryLimitPercent);
        this.memToMemMergeOutputsThreshold = conf.getInt(TezJobConfig.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS,
                ioSortFactor);
        this.mergeThreshold = (long) (this.memoryLimit
                * conf.getFloat(TezJobConfig.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT,
                        TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT));
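        // Continuing the illustration with assumed percents: a memoryLimit of
        // about 0.7 GiB times a merge percent of 0.90 gives a mergeThreshold
        // of about 0.63 GiB, and a single-shuffle percent of 0.25 gives a
        // maxSingleShuffleLimit of about 0.175 GiB, satisfying the
        // maxSingleShuffleLimit < mergeThreshold check below.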
        LOG.info("MergerManager: memoryLimit=" + memoryLimit + ", " + "maxSingleShuffleLimit="
                + maxSingleShuffleLimit + ", " + "mergeThreshold=" + mergeThreshold + ", " + "ioSortFactor="
                + ioSortFactor + ", " + "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);

        if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
            throw new RuntimeException("Invalid configuration: "
                    + "maxSingleShuffleLimit should be less than mergeThreshold. "
                    + "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
                    + ", mergeThreshold: " + this.mergeThreshold);
        }

        boolean allowMemToMemMerge = conf.getBoolean(TezJobConfig.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM,
                TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM);
        if (allowMemToMemMerge) {
            this.memToMemMerger = new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold);
            this.memToMemMerger.start();
        } else {
            this.memToMemMerger = null;
        }

        this.inMemoryMerger = new InMemoryMerger(this);
        this.inMemoryMerger.start();

        this.onDiskMerger = new OnDiskMerger(this);
        this.onDiskMerger.start();
    }

    public void waitForInMemoryMerge() throws InterruptedException {
        inMemoryMerger.waitForMerge();
    }

    private boolean canShuffleToMemory(long requestedSize) {
        return (requestedSize < maxSingleShuffleLimit);
    }

    // Sentinel returned by reserve() to signal the caller to stall and retry.
    private final MapOutput stallShuffle = new MapOutput(null);

    public synchronized MapOutput reserve(InputAttemptIdentifier srcAttemptIdentifier, long requestedSize,
            int fetcher) throws IOException {
        if (!canShuffleToMemory(requestedSize)) {
            LOG.info(srcAttemptIdentifier + ": Shuffling to disk since " + requestedSize
                    + " is greater than maxSingleShuffleLimit (" + maxSingleShuffleLimit + ")");
            return new MapOutput(srcAttemptIdentifier, this, requestedSize, conf, localDirAllocator, fetcher, true,
                    mapOutputFile);
        }

        // Stall shuffle if we are above the memory limit

        // It is possible that all threads could just be stalling and not make
        // progress at all. This could happen when:
        //
        // requested size is causing the used memory to go above limit &&
        // requested size < singleShuffleLimit &&
        // current used size < mergeThreshold (merge will not get triggered)
        //
        // To avoid this from happening, we allow exactly one thread to go past
        // the memory limit. We check (usedMemory > memoryLimit) and not
        // (usedMemory + requestedSize > memoryLimit). When this thread is done
        // fetching, this will automatically trigger a merge thereby unlocking
        // all the stalled threads

        if (usedMemory > memoryLimit) {
            LOG.debug(srcAttemptIdentifier + ": Stalling shuffle since usedMemory (" + usedMemory
                    + ") is greater than memoryLimit (" + memoryLimit + ")." + " CommitMemory is (" + commitMemory
                    + ")");
            return stallShuffle;
        }

        // Allow the in-memory shuffle to progress
        LOG.debug(srcAttemptIdentifier + ": Proceeding with shuffle since usedMemory (" + usedMemory
                + ") is less than memoryLimit (" + memoryLimit + ")." + " CommitMemory is (" + commitMemory + ")");
        return unconditionalReserve(srcAttemptIdentifier, requestedSize, true);
    }

    /**
     * Unconditional reserve: used by the memory-to-memory merger, and by
     * {@link #reserve} once the limit checks have passed. Adds to usedMemory
     * without checking the memory limit.
     *
     * @return a memory-backed MapOutput sized to the request
     */
    private synchronized MapOutput unconditionalReserve(InputAttemptIdentifier srcAttemptIdentifier,
            long requestedSize, boolean primaryMapOutput) {
        usedMemory += requestedSize;
        return new MapOutput(srcAttemptIdentifier, this, (int) requestedSize, primaryMapOutput);
    }

    synchronized void unreserve(long size) {
        commitMemory -= size;
        usedMemory -= size;
    }

    public synchronized void closeInMemoryFile(MapOutput mapOutput) {
        inMemoryMapOutputs.add(mapOutput);
        LOG.info("closeInMemoryFile -> map-output of size: " + mapOutput.getSize()
                + ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size() + ", commitMemory -> "
                + commitMemory + ", usedMemory ->" + usedMemory);

        commitMemory += mapOutput.getSize();

        synchronized (inMemoryMerger) {
            // Can hang if mergeThreshold is really low.
            if (!inMemoryMerger.isInProgress() && commitMemory >= mergeThreshold) {
                LOG.info("Starting inMemoryMerger's merge since commitMemory=" + commitMemory + " > mergeThreshold="
                        + mergeThreshold + ". Current usedMemory=" + usedMemory);
                inMemoryMapOutputs.addAll(inMemoryMergedMapOutputs);
                inMemoryMergedMapOutputs.clear();
                inMemoryMerger.startMerge(inMemoryMapOutputs);
            }
        }

        if (memToMemMerger != null) {
            synchronized (memToMemMerger) {
                if (!memToMemMerger.isInProgress() && inMemoryMapOutputs.size() >= memToMemMergeOutputsThreshold) {
                    memToMemMerger.startMerge(inMemoryMapOutputs);
                }
            }
        }
    }

    public synchronized void closeInMemoryMergedFile(MapOutput mapOutput) {
        inMemoryMergedMapOutputs.add(mapOutput);
        LOG.info("closeInMemoryMergedFile -> size: " + mapOutput.getSize() + ", inMemoryMergedMapOutputs.size() -> "
                + inMemoryMergedMapOutputs.size());
    }

    public synchronized void closeOnDiskFile(Path file) {
        onDiskMapOutputs.add(file);

        synchronized (onDiskMerger) {
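            // Trigger heuristic (borrowed from classic MapReduce): once
            // 2 * ioSortFactor - 1 spill files accumulate, merge them so the
            // eventual final merge's fan-in stays close to ioSortFactor.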
            if (!onDiskMerger.isInProgress() && onDiskMapOutputs.size() >= (2 * ioSortFactor - 1)) {
                onDiskMerger.startMerge(onDiskMapOutputs);
            }
        }
    }

    /**
     * Should <b>only</b> be used after the shuffle phase is complete;
     * otherwise it can return an invalid state, since a merge may not be
     * in progress due to inadequate inputs.
     * 
     * @return true if the merge process is complete, otherwise false
     */
    @Private
    public boolean isMergeComplete() {
        return finalMergeComplete;
    }

    public TezRawKeyValueIterator close() throws Throwable {
        // Wait for on-going merges to complete
        if (memToMemMerger != null) {
            memToMemMerger.close();
        }
        inMemoryMerger.close();
        onDiskMerger.close();

        List<MapOutput> memory = new ArrayList<MapOutput>(inMemoryMergedMapOutputs);
        memory.addAll(inMemoryMapOutputs);
        List<Path> disk = new ArrayList<Path>(onDiskMapOutputs);
        TezRawKeyValueIterator kvIter = finalMerge(conf, rfs, memory, disk);
        this.finalMergeComplete = true;
        return kvIter;
    }

    void runCombineProcessor(TezRawKeyValueIterator kvIter, Writer writer)
            throws IOException, InterruptedException {
        combiner.combine(kvIter, writer);
    }

    private class IntermediateMemoryToMemoryMerger extends MergeThread<MapOutput> {

        public IntermediateMemoryToMemoryMerger(MergeManager manager, int mergeFactor) {
            super(manager, mergeFactor, exceptionReporter);
            setName("InMemoryMerger - Thread to do in-memory merge of in-memory " + "shuffled map-outputs");
            setDaemon(true);
        }

        @Override
        public void merge(List<MapOutput> inputs) throws IOException {
            if (inputs == null || inputs.isEmpty()) {
                return;
            }

            InputAttemptIdentifier dummyMapId = inputs.get(0).getAttemptIdentifier();
            List<Segment> inMemorySegments = new ArrayList<Segment>();
            long mergeOutputSize = createInMemorySegments(inputs, inMemorySegments, 0);
            int noInMemorySegments = inMemorySegments.size();

            MapOutput mergedMapOutput = unconditionalReserve(dummyMapId, mergeOutputSize, false);

            Writer writer = new InMemoryWriter(mergedMapOutput.getArrayStream());

            LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments + " segments of total-size: "
                    + mergeOutputSize);

            TezRawKeyValueIterator rIter = TezMerger.merge(conf, rfs,
                    ConfigUtils.getIntermediateInputKeyClass(conf),
                    ConfigUtils.getIntermediateInputValueClass(conf), inMemorySegments, inMemorySegments.size(),
                    new Path(inputContext.getUniqueIdentifier()),
                    (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(conf), nullProgressable, null,
                    null, null);
            TezMerger.writeFile(rIter, writer, nullProgressable, TezJobConfig.DEFAULT_RECORDS_BEFORE_PROGRESS);
            writer.close();

            LOG.info(inputContext.getUniqueIdentifier() + " Memory-to-Memory merge of the " + noInMemorySegments
                    + " files in-memory complete.");

            // Note the output of the merge
            closeInMemoryMergedFile(mergedMapOutput);
        }
    }

    private class InMemoryMerger extends MergeThread<MapOutput> {

        public InMemoryMerger(MergeManager manager) {
            super(manager, Integer.MAX_VALUE, exceptionReporter);
            setName("InMemoryMerger - Thread to merge in-memory shuffled map-outputs");
            setDaemon(true);
        }

        @Override
        public void merge(List<MapOutput> inputs) throws IOException, InterruptedException {
            if (inputs == null || inputs.isEmpty()) {
                return;
            }

            // Name this output file after the first file in the current list
            // of in-memory files. That name is guaranteed to be absent on disk
            // at this point, so a previously created spill is never
            // overwritten. The output file must also be created now, because
            // the input is not guaranteed to still be present once merge is
            // called (empty files are deleted as soon as they are seen in the
            // merge method).

            // Figure out the source task id.
            InputAttemptIdentifier srcTaskIdentifier = inputs.get(0).getAttemptIdentifier();

            List<Segment> inMemorySegments = new ArrayList<Segment>();
            long mergeOutputSize = createInMemorySegments(inputs, inMemorySegments, 0);
            int noInMemorySegments = inMemorySegments.size();

            Path outputPath = mapOutputFile
                    .getInputFileForWrite(srcTaskIdentifier.getInputIdentifier().getSrcTaskIndex(), mergeOutputSize)
                    .suffix(Constants.MERGED_OUTPUT_PREFIX);

            Writer writer = null;
            try {
                writer = new Writer(conf, rfs, outputPath, (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                        (Class) ConfigUtils.getIntermediateInputValueClass(conf), codec, null);

                TezRawKeyValueIterator rIter = null;
                LOG.info("Initiating in-memory merge with " + noInMemorySegments + " segments...");

                rIter = TezMerger.merge(conf, rfs, (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                        (Class) ConfigUtils.getIntermediateInputValueClass(conf), inMemorySegments,
                        inMemorySegments.size(), new Path(inputContext.getUniqueIdentifier()),
                        (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(conf), nullProgressable,
                        spilledRecordsCounter, null, null);

                if (null == combiner) {
                    TezMerger.writeFile(rIter, writer, nullProgressable,
                            TezJobConfig.DEFAULT_RECORDS_BEFORE_PROGRESS);
                } else {
                    runCombineProcessor(rIter, writer);
                }
                writer.close();
                writer = null;

                LOG.info(inputContext.getUniqueIdentifier() + " Merge of the " + noInMemorySegments
                        + " files in-memory complete." + " Local file is " + outputPath + " of size "
                        + localFS.getFileStatus(outputPath).getLen());
            } catch (IOException e) {
                // Make sure we delete the on-disk output file created above
                // if the merge fails part-way through.
                localFS.delete(outputPath, true);
                throw e;
            } finally {
                if (writer != null) {
                    writer.close();
                }
            }

            // Note the output of the merge
            closeOnDiskFile(outputPath);
        }

    }

    private class OnDiskMerger extends MergeThread<Path> {

        public OnDiskMerger(MergeManager manager) {
            super(manager, Integer.MAX_VALUE, exceptionReporter);
            setName("OnDiskMerger - Thread to merge on-disk map-outputs");
            setDaemon(true);
        }

        @Override
        public void merge(List<Path> inputs) throws IOException {
            // sanity check
            if (inputs == null || inputs.isEmpty()) {
                LOG.info("No ondisk files to merge...");
                return;
            }

            long approxOutputSize = 0;
            int bytesPerSum = conf.getInt("io.bytes.per.checksum", 512);

            LOG.info("OnDiskMerger: We have  " + inputs.size() + " map outputs on disk. Triggering merge...");

            // 1. Prepare the list of files to be merged. 
            for (Path file : inputs) {
                approxOutputSize += localFS.getFileStatus(file).getLen();
            }

            // add the checksum length
            approxOutputSize += ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);
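            // (CRC32 adds about 4 checksum bytes per bytesPerSum-byte chunk,
            // plus a small file header.)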

            // 2. Start the on-disk merge process
            Path outputPath = localDirAllocator
                    .getLocalPathForWrite(inputs.get(0).toString(), approxOutputSize, conf)
                    .suffix(Constants.MERGED_OUTPUT_PREFIX);
            Writer writer = new Writer(conf, rfs, outputPath,
                    (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                    (Class) ConfigUtils.getIntermediateInputValueClass(conf), codec, null);
            TezRawKeyValueIterator iter = null;
            Path tmpDir = new Path(inputContext.getUniqueIdentifier());
            try {
                iter = TezMerger.merge(conf, rfs, (Class) ConfigUtils.getIntermediateInputKeyClass(conf),
                        (Class) ConfigUtils.getIntermediateInputValueClass(conf), codec, ifileReadAhead,
                        ifileReadAheadLength, ifileBufferSize, inputs.toArray(new Path[inputs.size()]), true,
                        ioSortFactor, tmpDir, (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(conf),
                        nullProgressable, spilledRecordsCounter, null, mergedMapOutputsCounter, null);

                TezMerger.writeFile(iter, writer, nullProgressable, TezJobConfig.DEFAULT_RECORDS_BEFORE_PROGRESS);
                writer.close();
            } catch (IOException e) {
                localFS.delete(outputPath, true);
                throw e;
            }

            closeOnDiskFile(outputPath);

            LOG.info(inputContext.getUniqueIdentifier() + " Finished merging " + inputs.size()
                    + " map output files on disk of total-size " + approxOutputSize + "." + " Local output file is "
                    + outputPath + " of size " + localFS.getFileStatus(outputPath).getLen());
        }
    }

    private long createInMemorySegments(List<MapOutput> inMemoryMapOutputs, List<Segment> inMemorySegments,
            long leaveBytes) throws IOException {
        long totalSize = 0L;
        // fullSize could come from the RamManager, but files can already be
        // closed without yet appearing in inMemoryMapOutputs, so compute it here.
        long fullSize = 0L;
        for (MapOutput mo : inMemoryMapOutputs) {
            fullSize += mo.getMemory().length;
        }
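        // Drain map outputs into segments, in order, until at most
        // leaveBytes of shuffled data remain held in memory.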
        while (fullSize > leaveBytes) {
            MapOutput mo = inMemoryMapOutputs.remove(0);
            byte[] data = mo.getMemory();
            long size = data.length;
            totalSize += size;
            fullSize -= size;
            IFile.Reader reader = new InMemoryReader(MergeManager.this, mo.getAttemptIdentifier(), data, 0,
                    (int) size);
            inMemorySegments
                    .add(new Segment(reader, true, (mo.isPrimaryMapOutput() ? mergedMapOutputsCounter : null)));
        }
        return totalSize;
    }

    class RawKVIteratorReader extends IFile.Reader {

        private final TezRawKeyValueIterator kvIter;

        public RawKVIteratorReader(TezRawKeyValueIterator kvIter, long size) throws IOException {
            super(null, size, null, spilledRecordsCounter, ifileReadAhead, ifileReadAheadLength, ifileBufferSize);
            this.kvIter = kvIter;
        }

        public boolean nextRawKey(DataInputBuffer key) throws IOException {
            if (kvIter.next()) {
                final DataInputBuffer kb = kvIter.getKey();
                final int kp = kb.getPosition();
                final int klen = kb.getLength() - kp;
                key.reset(kb.getData(), kp, klen);
                bytesRead += klen;
                return true;
            }
            return false;
        }

        public void nextRawValue(DataInputBuffer value) throws IOException {
            final DataInputBuffer vb = kvIter.getValue();
            final int vp = vb.getPosition();
            final int vlen = vb.getLength() - vp;
            value.reset(vb.getData(), vp, vlen);
            bytesRead += vlen;
        }

        public long getPosition() throws IOException {
            return bytesRead;
        }

        public void close() throws IOException {
            kvIter.close();
        }
    }

    private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs, List<MapOutput> inMemoryMapOutputs,
            List<Path> onDiskMapOutputs) throws IOException {
        LOG.info("finalMerge called with " + inMemoryMapOutputs.size() + " in-memory map-outputs and "
                + onDiskMapOutputs.size() + " on-disk map-outputs");

        final float maxRedPer = job.getFloat(TezJobConfig.TEZ_RUNTIME_INPUT_BUFFER_PERCENT,
                TezJobConfig.DEFAULT_TEZ_RUNTIME_INPUT_BUFFER_PERCENT);
        if (maxRedPer > 1.0 || maxRedPer < 0.0) {
            throw new IOException("Invalid value for " + TezJobConfig.TEZ_RUNTIME_INPUT_BUFFER_PERCENT
                    + ": " + maxRedPer);
        }
        int maxInMemReduce = (int) Math.min(Runtime.getRuntime().maxMemory() * maxRedPer, Integer.MAX_VALUE);
        LOG.info("Memory allocated for final merge output: " + maxInMemReduce + ", using factor: " + maxRedPer);

        // merge config params
        Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
        Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
        final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
        final RawComparator comparator = (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);
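
        // The final merge proceeds in three stages:
        //   1. If in-memory data exceeds the post-shuffle budget
        //      (maxInMemReduce), spill enough segments to one disk file.
        //   2. Merge all on-disk files (plus any in-memory segments retained
        //      for this step) into a single iterator, fan-in ioSortFactor.
        //   3. Merge the remaining in-memory segments with that disk
        //      iterator and return the result to the processor.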

        // segments required to vacate memory
        List<Segment> memDiskSegments = new ArrayList<Segment>();
        long inMemToDiskBytes = 0;
        boolean mergePhaseFinished = false;
        if (inMemoryMapOutputs.size() > 0) {
            int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier().getSrcTaskIndex();
            inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
            final int numMemDiskSegments = memDiskSegments.size();
            if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {

                // If we reach here, it implies that we have less than io.sort.factor
                // disk segments and this will be incremented by 1 (result of the 
                // memory segments merge). Since this total would still be 
                // <= io.sort.factor, we will not do any more intermediate merges,
                // the merge of all these disk segments would be directly fed to the
                // reduce method

                mergePhaseFinished = true;
                // must spill to disk, but can't retain in-mem for intermediate merge
                final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, inMemToDiskBytes)
                        .suffix(Constants.MERGED_OUTPUT_PREFIX);
                final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass, memDiskSegments,
                        numMemDiskSegments, tmpDir, comparator, nullProgressable, spilledRecordsCounter, null,
                        null);
                final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null);
                try {
                    TezMerger.writeFile(rIter, writer, nullProgressable,
                            TezJobConfig.DEFAULT_RECORDS_BEFORE_PROGRESS);
                    // add to list of final disk outputs.
                    onDiskMapOutputs.add(outputPath);
                } catch (IOException e) {
                    if (null != outputPath) {
                        try {
                            fs.delete(outputPath, true);
                        } catch (IOException ie) {
                            // NOTHING
                        }
                    }
                    throw e;
                } finally {
                    if (null != writer) {
                        writer.close();
                    }
                }
                LOG.info("Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes
                        + " bytes to disk to satisfy " + "reduce memory limit");
                inMemToDiskBytes = 0;
                memDiskSegments.clear();
            } else if (inMemToDiskBytes != 0) {
                LOG.info("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes
                        + " bytes in memory for " + "intermediate, on-disk merge");
            }
        }

        // segments on disk
        List<Segment> diskSegments = new ArrayList<Segment>();
        long onDiskBytes = inMemToDiskBytes;
        Path[] onDisk = onDiskMapOutputs.toArray(new Path[onDiskMapOutputs.size()]);
        for (Path file : onDisk) {
            onDiskBytes += fs.getFileStatus(file).getLen();
            LOG.debug("Disk file: " + file + " Length is " + fs.getFileStatus(file).getLen());
            diskSegments.add(new Segment(job, fs, file, codec, ifileReadAhead, ifileReadAheadLength,
                    ifileBufferSize, false,
                    (file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter)));
        }
        LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
        Collections.sort(diskSegments, new Comparator<Segment>() {
            public int compare(Segment o1, Segment o2) {
                if (o1.getLength() == o2.getLength()) {
                    return 0;
                }
                return o1.getLength() < o2.getLength() ? -1 : 1;
            }
        });

        // Build the final list of segments: the disk-backed merge plus what
        // remains in memory.
        List<Segment> finalSegments = new ArrayList<Segment>();
        long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
        LOG.info("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
        if (0 != onDiskBytes) {
            final int numInMemSegments = memDiskSegments.size();
            diskSegments.addAll(0, memDiskSegments);
            memDiskSegments.clear();
            TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, diskSegments,
                    ioSortFactor, numInMemSegments, tmpDir, comparator, nullProgressable, false,
                    spilledRecordsCounter, null, null);
            diskSegments.clear();
            if (0 == finalSegments.size()) {
                return diskMerge;
            }
            finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), true));
        }
        return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir,
                comparator, nullProgressable, spilledRecordsCounter, null, null);

    }
}
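
Example

For orientation, here is a minimal usage sketch. It is not part of the Tez sources: the framework-supplied objects, the fetcher id, and the byte-copy step are assumed, and how a caller distinguishes the internal stall sentinel returned by reserve() is omitted.

package org.apache.tez.runtime.library.common.shuffle.impl;

import org.apache.tez.runtime.library.common.InputAttemptIdentifier;
import org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator;

// Hypothetical driver, for illustration only.
public class MergeManagerUsageSketch {

    static TezRawKeyValueIterator fetchOneAndFinish(MergeManager merger,
            InputAttemptIdentifier srcId, long mapOutputSize) throws Throwable {
        // 1. Reserve space before copying a map output. The result may be
        //    disk-backed (large outputs), memory-backed, or a stall signal.
        MapOutput mapOutput = merger.reserve(srcId, mapOutputSize, 0 /* fetcher */);

        // 2. ...copy the fetched bytes into the reserved MapOutput...

        // 3. Commit the finished in-memory copy; this can trigger the
        //    background in-memory and memory-to-memory merge threads.
        merger.closeInMemoryFile(mapOutput);

        // 4. After the last input is committed, drain pending merges and
        //    obtain the final sorted iterator for the processor.
        merger.waitForInMemoryMerge();
        return merger.close();
    }
}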