org.apache.tez.mapreduce.examples.UnionExample.java — source code

Java tutorial

Introduction

This page presents the full source code for the class org.apache.tez.mapreduce.examples.UnionExample, an Apache Tez example that demonstrates the use of vertex groups (unions) in a Tez DAG.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tez.mapreduce.examples;

import java.io.IOException;
import java.util.EnumSet;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.GroupInputEdge;
import org.apache.tez.dag.api.VertexGroup;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.StatusGetOpts;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.mapreduce.processor.SimpleMRProcessor;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.Output;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.api.KeyValuesReader;
import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig;
import org.apache.tez.runtime.library.input.ConcatenatedMergedKeyValuesInput;
import org.apache.tez.runtime.library.partitioner.HashPartitioner;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;

public class UnionExample {

    /**
     * Tokenizes each input line and emits a (word, 1) pair per token to the
     * "checker" output. Vertices that belong to the union group ("map1"/"map2")
     * additionally mirror every pair to a "parts" file sink; "map3" does not.
     */
    public static class TokenProcessor extends SimpleMRProcessor {
        IntWritable one = new IntWritable(1);
        Text word = new Text();

        public TokenProcessor(ProcessorContext context) {
            super(context);
        }

        @Override
        public void run() throws Exception {
            Preconditions.checkArgument(getInputs().size() == 1);
            // Only "map3" runs outside the union group and therefore has a
            // single output; the grouped vertices also carry "parts".
            final boolean partOfUnion = !getContext().getTaskVertexName().equals("map3");
            Preconditions.checkArgument(getOutputs().size() == (partOfUnion ? 2 : 1));
            Preconditions.checkArgument(getOutputs().containsKey("checker"));

            MRInput mrInput = (MRInput) getInputs().values().iterator().next();
            KeyValueReader reader = mrInput.getReader();
            Output checkerOutput = getOutputs().get("checker");
            KeyValueWriter checkerWriter = (KeyValueWriter) checkerOutput.getWriter();
            KeyValueWriter partsWriter = null;
            if (partOfUnion) {
                MROutput partsOutput = (MROutput) getOutputs().get("parts");
                partsWriter = partsOutput.getWriter();
            }

            while (reader.next()) {
                StringTokenizer tokens = new StringTokenizer(reader.getCurrentValue().toString());
                while (tokens.hasMoreTokens()) {
                    word.set(tokens.nextToken());
                    checkerWriter.write(word, one);
                    if (partOfUnion) {
                        partsWriter.write(word, one);
                    }
                }
            }
        }

    }

    /**
     * Consumes the grouped "union" input (map1 + map2) and the "map3" input,
     * checks that the union's per-word counts are exactly double map3's (all
     * three maps read the same data), and writes the distinct-word count to
     * the "union" sink. Each union occurrence is also replayed to "all-parts".
     */
    public static class UnionProcessor extends SimpleMRProcessor {
        IntWritable one = new IntWritable(1);

        public UnionProcessor(ProcessorContext context) {
            super(context);
        }

        @Override
        public void run() throws Exception {
            Preconditions.checkArgument(getInputs().size() == 2);
            Preconditions.checkArgument(getOutputs().size() == 2);
            MROutput unionOut = (MROutput) getOutputs().get("union");
            MROutput allPartsOut = (MROutput) getOutputs().get("all-parts");
            KeyValueWriter unionWriter = unionOut.getWriter();
            KeyValueWriter allPartsWriter = allPartsOut.getWriter();

            Map<String, AtomicInteger> counts = Maps.newHashMap();

            // Pass 1: accumulate counts from the grouped input, replaying each
            // occurrence to the all-parts sink.
            LogicalInput unionInput = getInputs().get("union");
            KeyValuesReader reader = (KeyValuesReader) unionInput.getReader();
            while (reader.next()) {
                String token = ((Text) reader.getCurrentKey()).toString();
                IntWritable count = (IntWritable) reader.getCurrentValues().iterator().next();
                int n = count.get();
                for (int i = 0; i < n; ++i) {
                    allPartsWriter.write(token, one);
                }
                AtomicInteger existing = counts.get(token);
                if (existing == null) {
                    counts.put(token, new AtomicInteger(n));
                } else {
                    existing.addAndGet(n);
                }
            }

            // Pass 2: subtract twice map3's count per word; every accumulator
            // must land exactly on zero.
            LogicalInput map3Input = getInputs().get("map3");
            reader = (KeyValuesReader) map3Input.getReader();
            while (reader.next()) {
                String token = ((Text) reader.getCurrentKey()).toString();
                IntWritable count = (IntWritable) reader.getCurrentValues().iterator().next();
                AtomicInteger existing = counts.get(token);
                if (existing == null) {
                    throw new TezUncheckedException("Expected to exist: " + token);
                }
                existing.getAndAdd(count.get() * -2);
            }
            for (AtomicInteger residual : counts.values()) {
                if (residual.get() != 0) {
                    throw new TezUncheckedException("Unexpected non-zero value");
                }
            }
            unionWriter.write("Union", new IntWritable(counts.size()));
        }

    }

    /**
     * Builds the union-example DAG: {map1, map2} form a vertex group feeding
     * "checker" through a concatenated-merged group edge, while map3 feeds
     * the same checker through an ordinary ordered-partitioned edge. File
     * sinks are attached for the group's parts and the checker's two outputs.
     *
     * @param fs             unused here; kept for signature compatibility
     * @param tezConf        base configuration cloned for each input/output
     * @param localResources unused here; kept for signature compatibility
     * @param stagingDir     unused here; kept for signature compatibility
     * @param inputPath      text input read by all three map vertices
     * @param outputPath     base path; "-parts"/"-all-parts" suffixes are derived
     * @return the assembled, unsubmitted DAG
     */
    private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources,
            Path stagingDir, String inputPath, String outputPath) throws IOException {
        DAG dag = DAG.create("UnionExample");

        // Shared old-API text data source; splits are generated client-side.
        Configuration mrInputConf = new Configuration(tezConf);
        mrInputConf.setBoolean("mapred.mapper.new-api", false);
        mrInputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
        mrInputConf.set(FileInputFormat.INPUT_DIR, inputPath);
        MRInput.MRInputConfigBuilder inputBuilder = MRInput.createConfigBuilder(mrInputConf, null);
        DataSourceDescriptor dataSource = inputBuilder.generateSplitsInAM(false).build();

        // -1 tasks lets the data source determine each map vertex's parallelism.
        Vertex map1 = Vertex
                .create("map1", ProcessorDescriptor.create(TokenProcessor.class.getName()), -1)
                .addDataSource("MRInput", dataSource);
        Vertex map2 = Vertex
                .create("map2", ProcessorDescriptor.create(TokenProcessor.class.getName()), -1)
                .addDataSource("MRInput", dataSource);
        Vertex map3 = Vertex
                .create("map3", ProcessorDescriptor.create(TokenProcessor.class.getName()), -1)
                .addDataSource("MRInput", dataSource);

        // Single checker task reconciles the group input against map3.
        Vertex checker = Vertex.create("checker", ProcessorDescriptor.create(UnionProcessor.class.getName()),
                1);

        // "union" sink: old-API TextOutputFormat writing to outputPath.
        Configuration unionOutConf = new Configuration(tezConf);
        unionOutConf.setBoolean("mapred.reducer.new-api", false);
        unionOutConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
        unionOutConf.set(FileOutputFormat.OUTDIR, outputPath);
        DataSinkDescriptor unionSink = MROutput.createConfigBuilder(unionOutConf, null).build();
        checker.addDataSink("union", unionSink);

        Configuration allPartsConf = new Configuration(tezConf);
        DataSinkDescriptor allPartsSink = MROutput
                .createConfigBuilder(allPartsConf, TextOutputFormat.class, outputPath + "-all-parts").build();
        checker.addDataSink("all-parts", allPartsSink);

        // The vertex group carries its own "parts" sink shared by map1 and map2.
        Configuration partsConf = new Configuration(tezConf);
        DataSinkDescriptor partsSink = MROutput
                .createConfigBuilder(partsConf, TextOutputFormat.class, outputPath + "-parts").build();
        VertexGroup unionGroup = dag.createVertexGroup("union", map1, map2);
        unionGroup.addDataSink("parts", partsSink);

        OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
                .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
                .build();

        dag.addVertex(map1).addVertex(map2).addVertex(map3).addVertex(checker)
                .addEdge(Edge.create(map3, checker, edgeConf.createDefaultEdgeProperty()))
                .addEdge(GroupInputEdge.create(unionGroup, checker, edgeConf.createDefaultEdgeProperty(),
                        InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
        return dag;
    }

    /** Prints command-line usage to stderr. */
    private static void printUsage() {
        // Fixed: the message was built as "Usage: " + " unionexample ...",
        // which printed a stray double space.
        System.err.println("Usage: unionexample <in1> <out1>");
    }

    /**
     * Runs the union-example DAG inside a Tez session.
     *
     * @param inputPath  input text path read by all three map vertices
     * @param outputPath base output path; must not already exist
     * @param conf       optional base configuration; may be {@code null}
     * @return {@code true} if the DAG succeeded, {@code false} otherwise
     * @throws Exception on filesystem, session, or submission failures
     */
    public boolean run(String inputPath, String outputPath, Configuration conf) throws Exception {
        System.out.println("Running UnionExample");
        // conf and UGI
        TezConfiguration tezConf;
        if (conf != null) {
            tezConf = new TezConfiguration(conf);
        } else {
            tezConf = new TezConfiguration();
        }
        UserGroupInformation.setConfiguration(tezConf);
        String user = UserGroupInformation.getCurrentUser().getShortUserName();

        // Per-run staging dir: /user/<user>/.staging/<timestamp>
        // Fixed: dropped a duplicated Path.SEPARATOR that produced ".staging//<ts>".
        FileSystem fs = FileSystem.get(tezConf);
        String stagingDirStr = Path.SEPARATOR + "user" + Path.SEPARATOR + user + Path.SEPARATOR + ".staging"
                + Path.SEPARATOR + Long.toString(System.currentTimeMillis());
        Path stagingDir = new Path(stagingDirStr);
        tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDirStr);
        stagingDir = fs.makeQualified(stagingDir);

        // No need to add jar containing this class as assumed to be part of
        // the tez jars.

        // TEZ-674 Obtain tokens based on the Input / Output paths. For now assuming staging dir
        // is the same filesystem as the one used for Input/Output.

        TezClient tezSession = TezClient.create("UnionExampleSession", tezConf);
        tezSession.start();

        DAGClient dagClient = null;

        try {
            // Fail fast rather than letting the DAG fail mid-run.
            if (fs.exists(new Path(outputPath))) {
                throw new FileAlreadyExistsException("Output directory " + outputPath + " already exists");
            }

            Map<String, LocalResource> localResources = new TreeMap<String, LocalResource>();

            DAG dag = createDAG(fs, tezConf, localResources, stagingDir, inputPath, outputPath);

            tezSession.waitTillReady();
            dagClient = tezSession.submitDAG(dag);

            // Block until completion, echoing counters as status updates.
            DAGStatus dagStatus = dagClient
                    .waitForCompletionWithStatusUpdates(EnumSet.of(StatusGetOpts.GET_COUNTERS));
            if (dagStatus.getState() != DAGStatus.State.SUCCEEDED) {
                System.out.println("DAG diagnostics: " + dagStatus.getDiagnostics());
                return false;
            }
            return true;
        } finally {
            // Best-effort cleanup of the per-run staging dir, then stop the session.
            fs.delete(stagingDir, true);
            tezSession.stop();
        }
    }

    /**
     * CLI entry point. Expects exactly two arguments: input path and output path.
     * Exits with status 2 on bad arguments and status 1 if the DAG fails.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            printUsage();
            System.exit(2);
        }
        UnionExample job = new UnionExample();
        // Fixed: the boolean result of run() was previously ignored, so a
        // failed DAG still exited with status 0.
        if (!job.run(args[0], args[1], null)) {
            System.exit(1);
        }
    }
}