org.apache.tez.Tez_1494.java — source code listing

Java tutorial

Introduction

Below is the full source code for org.apache.tez.Tez_1494.java.

Source

package org.apache.tez;

import com.google.common.base.Preconditions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records;
import org.apache.tez.client.TezClient;
import org.apache.tez.common.TezCommonUtils;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.Progress;
import org.apache.tez.dag.api.client.StatusGetOpts;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.mapreduce.processor.SimpleMRProcessor;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.api.Reader;
import org.apache.tez.runtime.api.Writer;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.api.KeyValuesReader;
import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig;
import org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig;
import org.apache.tez.runtime.library.partitioner.HashPartitioner;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Simple test case for reproducing the TEZ-1494 issue.
 *
 * Create two sample text files and upload them to HDFS.
 * Use the following command to run the test; pass -Dtez.shuffle-vertex-manager.enable
 * .auto-parallel=true to reproduce the issue:
 * HADOOP_USER_CLASSPATH_FIRST=true HADOOP_CLASSPATH=/root/tez-1494/tez_1494.jar:/root/tez-autobuild/dist/tez/*:/root/tez-autobuild/dist/tez/lib/*:/root/tez-autobuild/dist/tez/conf:$HADOOP_CLASSPATH yarn jar /root/tez-1494/tez_1494.jar org.apache.tez.Tez_1494 -Dtez.shuffle-vertex-manager.enable.auto-parallel=true /tmp/map_2.out /tmp/map_7.out /tmp/test.out.1484
 */
public class Tez_1494 extends Configured implements Tool {

    static final String PROCESSOR_NAME = "processorName";
    static final String SLEEP_INTERVAL = "sleepInterval";

    public static class DummyProcessor extends SimpleMRProcessor {

        private static final Log LOG = LogFactory.getLog(DummyProcessor.class);
        Configuration conf;
        String processorName;
        String logId;
        long sleepInterval;

        public DummyProcessor(ProcessorContext context) {
            super(context);
        }

        @Override
        public void run() throws Exception {
            LOG.info("Emitting zero records : " + getContext().getTaskIndex() + " : taskVertexIndex:"
                    + getContext().getTaskVertexIndex() + "; " + logId);
            if (sleepInterval > 0) {
                LOG.info("Sleeping for " + sleepInterval + "; " + logId);
                Thread.sleep(sleepInterval);
            }
        }

        @Override
        public void initialize() throws Exception {
            super.initialize();
            if (getContext().getUserPayload() != null) {
                conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
                this.processorName = conf.get(PROCESSOR_NAME);
                this.logId = this.processorName;
                this.sleepInterval = conf.getLong(SLEEP_INTERVAL, 0);
            }
        }
    }

    /**
     * Echo processor with single output writer (Good enough for this test)
     */
    public static class EchoProcessor extends SimpleMRProcessor {

        private static final Log LOG = LogFactory.getLog(EchoProcessor.class);

        private String processorName;
        private String logId;

        public EchoProcessor(ProcessorContext context) {
            super(context);
        }

        @Override
        public void initialize() throws Exception {
            super.initialize();
            if (getContext().getUserPayload() != null) {
                Configuration conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
                this.processorName = conf.get(PROCESSOR_NAME);
                this.logId = this.processorName;
            }
        }

        @Override
        public void run() throws Exception {
            LOG.info(processorName + " Inputs : " + getInputs().values() + "; outputs: " + getOutputs().values());
            Writer writer = getOutputs().values().iterator().next().getWriter();

            Iterator<LogicalInput> inputIterator = getInputs().values().iterator();
            while (inputIterator.hasNext()) {
                Reader reader = inputIterator.next().getReader();
                //TEZ-1503
                if (reader instanceof KeyValueReader) {
                    copy((KeyValueReader) reader, (KeyValueWriter) writer);
                }
                if (reader instanceof KeyValuesReader) {
                    copy((KeyValuesReader) reader, (KeyValueWriter) writer);
                }
            }
        }
    }

    static void copy(KeyValuesReader reader, KeyValueWriter writer) throws IOException {
        while (reader.next()) {
            for (Object val : reader.getCurrentValues()) {
                writer.write(reader.getCurrentKey(), val);
            }
        }
    }

    static void copy(KeyValueReader reader, KeyValueWriter writer) throws IOException {
        while (reader.next()) {
            writer.write(reader.getCurrentKey(), reader.getCurrentValue());
        }
    }

    public static Path getCurrentJarURL() throws URISyntaxException {
        return new Path(Tez_1494.class.getProtectionDomain().getCodeSource().getLocation().toURI());
    }

    private Map<String, LocalResource> getLocalResources(TezConfiguration tezConf)
            throws IOException, URISyntaxException {
        Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
        Path stagingDir = TezCommonUtils.getTezBaseStagingPath(tezConf);

        // staging dir
        FileSystem fs = FileSystem.get(tezConf);
        Path jobJar = new Path(stagingDir, "job.jar");
        if (fs.exists(jobJar)) {
            fs.delete(jobJar, false);
        }
        fs.copyFromLocalFile(getCurrentJarURL(), jobJar);

        localResources.put("job.jar", createLocalResource(fs, jobJar));
        return localResources;
    }

    private LocalResource createLocalResource(FileSystem fs, Path file) throws IOException {
        final LocalResourceType type = LocalResourceType.FILE;
        final LocalResourceVisibility visibility = LocalResourceVisibility.APPLICATION;
        FileStatus fstat = fs.getFileStatus(file);
        org.apache.hadoop.yarn.api.records.URL resourceURL = ConverterUtils.getYarnUrlFromPath(file);
        long resourceSize = fstat.getLen();
        long resourceModificationTime = fstat.getModificationTime();
        LocalResource lr = Records.newRecord(LocalResource.class);
        lr.setResource(resourceURL);
        lr.setType(type);
        lr.setSize(resourceSize);
        lr.setVisibility(visibility);
        lr.setTimestamp(resourceModificationTime);
        return lr;
    }

    private ProcessorDescriptor createDescriptor(String processorClassName) throws IOException {
        ProcessorDescriptor descriptor = ProcessorDescriptor.create(processorClassName);

        UserPayload payload = TezUtils.createUserPayloadFromConf(getConf());
        //not the best approach to send the entire config in payload.  But for testing, its fine
        descriptor.setUserPayload(payload);
        return descriptor;
    }

    private Vertex createVertex(String vName, String pName, String pClassName, String dataSource,
            DataSourceDescriptor dsource, String dataSink, DataSinkDescriptor dsink, int parallelism,
            long sleepInterval) throws IOException {
        parallelism = (parallelism <= 0) ? -1 : parallelism;
        sleepInterval = (sleepInterval <= 0) ? 0 : sleepInterval;
        getConf().set(PROCESSOR_NAME, pName);
        getConf().setLong(SLEEP_INTERVAL, sleepInterval);
        Vertex v = Vertex.create(vName, createDescriptor(pClassName), parallelism);
        if (dataSource != null) {
            Preconditions.checkArgument(dsource != null, "datasource can't be null");
            v.addDataSource(dataSource, dsource);
        }

        if (dataSink != null) {
            Preconditions.checkArgument(dsink != null, "datasink can't be null");
            v.addDataSink(dataSink, dsink);
        }
        return v;
    }

    public DAG createDAG(String map_2_input, String map_7_input, String output, TezConfiguration tezConf)
            throws IOException, URISyntaxException {

        List<Vertex> vertexList = new ArrayList<Vertex>();
        List<Edge> edgeList = new ArrayList<Edge>();

        /**
         * source1 --> Map_2 --(Scatter_Gather)--> Reducer_3
         * source2 --> Map_7 ----(unordered)---\\
         *                                       ==> Map_5
         * Reducer_3 -------(unordered)--------//
         * Map_5 --(Scatter_Gather)---> Reducer_6
         * Reducer_6 --> Sink
         */
        DataSourceDescriptor map_2_dataDescriptor = MRInput
                .createConfigBuilder(getConf(), TextInputFormat.class, map_2_input).build();
        DataSourceDescriptor map_7_dataDescriptor = MRInput
                .createConfigBuilder(getConf(), TextInputFormat.class, map_7_input).build();

        DataSinkDescriptor sinkDescriptor = MROutput.createConfigBuilder(getConf(), TextOutputFormat.class, output)
                .build();

        String name = "Map_2";
        Vertex map_2 = createVertex(name, name, DummyProcessor.class.getName(), "Map_2_input", map_2_dataDescriptor,
                null, null, -1, 0);
        vertexList.add(map_2);

        name = "Reducer_3";
        Vertex reducer_3 = createVertex(name, name, DummyProcessor.class.getName(), null, null, null, null, 2,
                10000);
        vertexList.add(reducer_3);

        name = "Map_5";
        Vertex map_5 = createVertex(name, name, EchoProcessor.class.getName(), null, null, null, null, 10, 0);
        vertexList.add(map_5);

        name = "Map_7";
        Vertex map_7 = createVertex(name, name, EchoProcessor.class.getName(), "Map_7_input", map_7_dataDescriptor,
                null, null, -1, 0);
        vertexList.add(map_7);

        name = "Reducer_6";
        Vertex reducer_6 = createVertex(name, name, EchoProcessor.class.getName(), null, null, "Reducer_6_sink",
                sinkDescriptor, 1, 0);
        vertexList.add(reducer_6);

        Edge m2_r3_edge = Edge.create(map_2, reducer_3, OrderedPartitionedKVEdgeConfig
                .newBuilder(LongWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName())
                .build().createDefaultEdgeProperty());

        Edge m7_m5_edge = Edge.create(map_7, map_5,
                UnorderedKVEdgeConfig.newBuilder(LongWritable.class.getName(), Text.class.getName()).build()
                        .createDefaultBroadcastEdgeProperty());

        Edge r3_m5_edge = Edge.create(reducer_3, map_5,
                UnorderedKVEdgeConfig.newBuilder(LongWritable.class.getName(), Text.class.getName()).build()
                        .createDefaultBroadcastEdgeProperty());

        Edge m5_r6_edge = Edge.create(map_5, reducer_6, OrderedPartitionedKVEdgeConfig
                .newBuilder(LongWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName())
                .build().createDefaultEdgeProperty());

        edgeList.add(m2_r3_edge);
        edgeList.add(m7_m5_edge);
        edgeList.add(r3_m5_edge);
        edgeList.add(m5_r6_edge);

        DAG dag = DAG.create("TEZ_1494");
        dag.addTaskLocalFiles(getLocalResources(tezConf));
        for (Vertex v : vertexList) {
            dag.addVertex(v);
        }

        for (Edge edge : edgeList) {
            dag.addEdge(edge);
        }

        return dag;
    }

    @Override
    public int run(String[] args) throws Exception {
        TezConfiguration tezConf = new TezConfiguration(getConf());

        String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();

        Preconditions.checkArgument(otherArgs != null && otherArgs.length == 3,
                "Please provide valid map_2_input map_7_input reducer_output path");

        String map_2_input = otherArgs[0];
        String map_7_input = otherArgs[1];
        String output = otherArgs[2];

        TezClient tezClient = TezClient.create("Tez_1494", tezConf, false);
        tezClient.start();
        DAG dag = createDAG(map_2_input, map_7_input, output, tezConf);

        AtomicBoolean shutdown = new AtomicBoolean(false);
        try {
            DAGClient statusClient = tezClient.submitDAG(dag);

            //Start monitor (note: not shutting down thread)
            Monitor monitor = new Monitor(statusClient, shutdown);

            DAGStatus status = statusClient
                    .waitForCompletionWithStatusUpdates(EnumSet.of(StatusGetOpts.GET_COUNTERS));

            return (status.getState() == DAGStatus.State.SUCCEEDED) ? 0 : -1;
        } finally {
            if (tezClient != null) {
                tezClient.stop();
            }
            shutdown.set(true);
        }
    }

    //Most of this is borrowed from Hive. Trimmed down to fit this test
    static class Monitor extends Thread {
        DAGClient client;
        AtomicBoolean shutdown;

        public Monitor(DAGClient client, AtomicBoolean shutdown) {
            this.client = client;
            this.shutdown = shutdown;
            this.start();
        }

        public void run() {
            try {
                report(client);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        public void report(DAGClient dagClient) throws Exception {

            Set<StatusGetOpts> opts = new HashSet<StatusGetOpts>();
            DAGStatus.State lastState = null;
            String lastReport = null;

            while (!shutdown.get()) {
                try {

                    DAGStatus status = dagClient.getDAGStatus(opts);
                    Map<String, Progress> progressMap = status.getVertexProgress();
                    DAGStatus.State state = status.getState();

                    if (state != lastState || state == state.RUNNING) {
                        lastState = state;

                        switch (state) {
                        case SUBMITTED:
                            System.out.println("Status: Submitted");
                            break;
                        case INITING:
                            System.out.println("Status: Initializing");
                            break;
                        case RUNNING:
                            printStatus(progressMap, lastReport);
                            break;
                        case SUCCEEDED:
                            System.out.println("Done!!!..Succeeded");
                            printStatus(progressMap, lastReport);
                            break;
                        }
                    }
                    Thread.sleep(2000);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        private void printStatus(Map<String, Progress> progressMap, String lastReport) {
            StringBuffer reportBuffer = new StringBuffer();

            SortedSet<String> keys = new TreeSet<String>(progressMap.keySet());
            for (String s : keys) {
                Progress progress = progressMap.get(s);
                final int complete = progress.getSucceededTaskCount();
                final int total = progress.getTotalTaskCount();
                final int running = progress.getRunningTaskCount();
                final int failed = progress.getFailedTaskCount();
                if (total <= 0) {
                    reportBuffer.append(String.format("%s: -/-\t", s, complete, total));
                } else {
                    if (complete < total && (complete > 0 || running > 0 || failed > 0)) {
                        /* vertex is started, but not complete */
                        if (failed > 0) {
                            reportBuffer.append(
                                    String.format("%s: %d(+%d,-%d)/%d\t", s, complete, running, failed, total));
                        } else {
                            reportBuffer.append(String.format("%s: %d(+%d)/%d\t", s, complete, running, total));
                        }
                    } else {
                        /* vertex is waiting for input/slots or complete */
                        if (failed > 0) {
                            /* tasks finished but some failed */
                            reportBuffer.append(String.format("%s: %d(-%d)/%d\t", s, complete, failed, total));
                        } else {
                            reportBuffer.append(String.format("%s: %d/%d\t", s, complete, total));
                        }
                    }
                }
            }

            String report = reportBuffer.toString();
            System.out.println(report);
        }
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new Configuration(), new Tez_1494(), args);
    }
}