org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tez.runtime.library.output;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.tez.common.TezCommonUtils;
import org.apache.tez.common.TezRuntimeFrameworkConfigs;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.TezUtilsInternal;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.runtime.api.AbstractLogicalOutput;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.OutputContext;
import org.apache.tez.runtime.api.events.CompositeDataMovementEvent;
import org.apache.tez.runtime.api.events.VertexManagerEvent;
import org.apache.tez.runtime.library.api.KeyValuesWriter;
import org.apache.tez.runtime.library.api.Partitioner;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;
import org.apache.tez.runtime.library.common.MemoryUpdateCallbackHandler;
import org.apache.tez.runtime.library.common.sort.impl.ExternalSorter;
import org.apache.tez.runtime.library.common.sort.impl.PipelinedSorter;
import org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord;
import org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord;
import org.apache.tez.runtime.library.common.sort.impl.dflt.DefaultSorter;
import org.apache.tez.runtime.library.common.shuffle.ShuffleUtils;
import org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto;
import org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.VertexManagerEventPayloadProto;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.protobuf.ByteString;

/**
 * {@link OrderedPartitionedKVOutput} is an {@link AbstractLogicalOutput} which sorts
 * key/value pairs written to it. It also partitions the output based on a
 * {@link Partitioner}
 */
@Public
public class OrderedPartitionedKVOutput extends AbstractLogicalOutput {

    private static final Log LOG = LogFactory.getLog(OrderedPartitionedKVOutput.class);

    protected ExternalSorter sorter;
    protected Configuration conf;
    protected MemoryUpdateCallbackHandler memoryUpdateCallbackHandler;
    private long startTime;
    private long endTime;
    private boolean sendEmptyPartitionDetails;
    private final AtomicBoolean isStarted = new AtomicBoolean(false);

    public OrderedPartitionedKVOutput(OutputContext outputContext, int numPhysicalOutputs) {
        super(outputContext, numPhysicalOutputs);
    }

    @Override
    public synchronized List<Event> initialize() throws IOException {
        this.startTime = System.nanoTime();
        this.conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
        // Initializing this parametr in this conf since it is used in multiple
        // places (wherever LocalDirAllocator is used) - TezTaskOutputFiles,
        // TezMerger, etc.
        this.conf.setStrings(TezRuntimeFrameworkConfigs.LOCAL_DIRS, getContext().getWorkDirs());
        this.memoryUpdateCallbackHandler = new MemoryUpdateCallbackHandler();
        getContext().requestInitialMemory(
                ExternalSorter.getInitialMemoryRequirement(conf, getContext().getTotalMemoryAvailableToTask()),
                memoryUpdateCallbackHandler);

        sendEmptyPartitionDetails = this.conf.getBoolean(
                TezRuntimeConfiguration.TEZ_RUNTIME_EMPTY_PARTITION_INFO_VIA_EVENTS_ENABLED,
                TezRuntimeConfiguration.TEZ_RUNTIME_EMPTY_PARTITION_INFO_VIA_EVENTS_ENABLED_DEFAULT);
        return Collections.emptyList();
    }

    @Override
    public synchronized void start() throws Exception {
        if (!isStarted.get()) {
            memoryUpdateCallbackHandler.validateUpdateReceived();
            if (this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SORT_THREADS,
                    TezRuntimeConfiguration.TEZ_RUNTIME_SORT_THREADS_DEFAULT) > 1) {
                sorter = new PipelinedSorter(getContext(), conf, getNumPhysicalOutputs(),
                        memoryUpdateCallbackHandler.getMemoryAssigned());
            } else {
                sorter = new DefaultSorter(getContext(), conf, getNumPhysicalOutputs(),
                        memoryUpdateCallbackHandler.getMemoryAssigned());
            }
            isStarted.set(true);
        }
    }

    @Override
    public synchronized KeyValuesWriter getWriter() throws IOException {
        Preconditions.checkState(isStarted.get(), "Cannot get writer before starting the Output");
        return new KeyValuesWriter() {
            @Override
            public void write(Object key, Object value) throws IOException {
                sorter.write(key, value);
            }

            @Override
            public void write(Object key, Iterable<Object> values) throws IOException {
                sorter.write(key, values);
            }
        };
    }

    @Override
    public synchronized void handleEvents(List<Event> outputEvents) {
        // Not expecting any events.
    }

    @Override
    public synchronized List<Event> close() throws IOException {
        if (sorter != null) {
            sorter.flush();
            sorter.close();
            this.endTime = System.nanoTime();
            return generateEventsOnClose();
        } else {
            LOG.warn("Attempting to close output " + getContext().getDestinationVertexName()
                    + " before it was started");
            return Collections.emptyList();
        }
    }

    protected List<Event> generateEventsOnClose() throws IOException {
        DataMovementEventPayloadProto.Builder payloadBuilder = DataMovementEventPayloadProto.newBuilder();

        boolean outputGenerated = true;
        if (sendEmptyPartitionDetails) {
            Path indexFile = sorter.getMapOutput().getOutputIndexFile();
            TezSpillRecord spillRecord = new TezSpillRecord(indexFile, conf);
            BitSet emptyPartitionDetails = new BitSet();
            int emptyPartitions = 0;
            for (int i = 0; i < spillRecord.size(); i++) {
                TezIndexRecord indexRecord = spillRecord.getIndex(i);
                if (!indexRecord.hasData()) {
                    emptyPartitionDetails.set(i);
                    emptyPartitions++;
                }
            }
            outputGenerated = (spillRecord.size() != emptyPartitions);
            if (emptyPartitions > 0) {
                ByteString emptyPartitionsBytesString = TezCommonUtils
                        .compressByteArrayToByteString(TezUtilsInternal.toByteArray(emptyPartitionDetails));
                payloadBuilder.setEmptyPartitions(emptyPartitionsBytesString);
                LOG.info("EmptyPartition bitsetSize=" + emptyPartitionDetails.cardinality() + ", numOutputs="
                        + getNumPhysicalOutputs() + ", emptyPartitions=" + emptyPartitions + ", compressedSize="
                        + emptyPartitionsBytesString.size());
            }
        }
        if (!sendEmptyPartitionDetails || outputGenerated) {
            String host = System.getenv(ApplicationConstants.Environment.NM_HOST.toString());
            ByteBuffer shuffleMetadata = getContext()
                    .getServiceProviderMetaData(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID);
            int shufflePort = ShuffleUtils.deserializeShuffleProviderMetaData(shuffleMetadata);
            payloadBuilder.setHost(host);
            payloadBuilder.setPort(shufflePort);
            payloadBuilder.setPathComponent(getContext().getUniqueIdentifier());
        }

        payloadBuilder.setRunDuration((int) ((endTime - startTime) / 1000));
        DataMovementEventPayloadProto payloadProto = payloadBuilder.build();
        ByteBuffer payload = payloadProto.toByteString().asReadOnlyByteBuffer();

        long outputSize = getContext().getCounters().findCounter(TaskCounter.OUTPUT_BYTES).getValue();
        VertexManagerEventPayloadProto.Builder vmBuilder = VertexManagerEventPayloadProto.newBuilder();
        vmBuilder.setOutputSize(outputSize);
        VertexManagerEvent vmEvent = VertexManagerEvent.create(getContext().getDestinationVertexName(),
                vmBuilder.build().toByteString().asReadOnlyByteBuffer());

        List<Event> events = Lists.newArrayListWithCapacity(getNumPhysicalOutputs() + 1);
        events.add(vmEvent);

        CompositeDataMovementEvent csdme = CompositeDataMovementEvent.create(0, getNumPhysicalOutputs(), payload);
        events.add(csdme);

        return events;
    }

    private static final Set<String> confKeys = new HashSet<String>();

    static {
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_IO_FILE_BUFFER_SIZE);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_SORT_THREADS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_COMPARATOR_CLASS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS_CODEC);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_EMPTY_PARTITION_INFO_VIA_EVENTS_ENABLED);
        confKeys.add(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT);
        confKeys.add(TezConfiguration.TEZ_COUNTERS_MAX);
        confKeys.add(TezConfiguration.TEZ_COUNTERS_GROUP_NAME_MAX_LENGTH);
        confKeys.add(TezConfiguration.TEZ_COUNTERS_COUNTER_NAME_MAX_LENGTH);
        confKeys.add(TezConfiguration.TEZ_COUNTERS_MAX_GROUPS);
    }

    // TODO Maybe add helper methods to extract keys
    // TODO Maybe add constants or an Enum to access the keys

    @InterfaceAudience.Private
    public static Set<String> getConfigurationKeySet() {
        return Collections.unmodifiableSet(confKeys);
    }
}