microbench.WordCountOnHDFSDataLocal.java Source code

Introduction

Here is the source code for microbench.WordCountOnHDFSDataLocal.java, a word count example of DataMPI on HDFS that uses the data-local feature.

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package microbench;

import mpid.core.HadoopReader;
import mpid.core.HadoopWriter;
import mpid.core.MPI_D;
import mpid.core.MPI_D_Combiner;
import mpid.core.MPI_D_Exception;
import mpid.core.util.MPI_D_Constants;
import mpid.util.DataMPIUtil;
import mpid.util.hadoop.HadoopIOUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

/**
 * A word count example of DataMPI on HDFS with data-local feature.
 */
public class WordCountOnHDFSDataLocal {
    /*
     * Path to the Hadoop configuration file (e.g. core-site.xml), taken from
     * the command line arguments.
     */
    private static String confPath = null;

    private static String inDir = null, outDir = null, maxUsedMemPercent = null, partSize = null, outFileNum = null,
            spillPercent = null;

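    /**
     * Combiner that locally sums the partial counts emitted for each word,
     * reducing the amount of data transferred to the A tasks.
     */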
    public static class WordCountCombiner implements MPI_D_Combiner<Text, IntWritable> {
        private IntWritable result = new IntWritable();
        private List<IntWritable> results = new ArrayList<IntWritable>();

        @Override
        public Iterator<IntWritable> combine(Text key, Iterator<IntWritable> values) throws MPI_D_Exception {
            int sum = 0;
            results.clear();
            while (values.hasNext()) {
                sum += values.next().get();
            }
            result.set(sum);
            results.add(result);
            return results.iterator();
        }

        public void close() {
            // do nothing
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        try {
            parseArgs(args);
            HashMap<String, String> conf = new HashMap<String, String>();
            initConf(conf);
            MPI_D.Init(args, MPI_D.Mode.Common, conf);

            JobConf jobConf = new JobConf(confPath);
            if (MPI_D.COMM_BIPARTITE_O != null) {
                // O communicator
                int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
                int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
                if (rank == 0) {
                    DataMPIUtil.printArgs(args);
                }
                System.out.println("The O task " + rank + " of " + size + " is working...");

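                // Read this O task's portion of the HDFS input (assigned with data
                // locality in mind) and emit a (word, 1) pair for every token.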
                HadoopReader<LongWritable, Text> reader = HadoopIOUtil.getReader(jobConf, inDir,
                        TextInputFormat.class, rank, MPI_D.COMM_BIPARTITE_O);
                Text word = new Text();
                IntWritable one = new IntWritable(1);
                LongWritable khead = reader.createKey();
                Text vhead = reader.createValue();
                while (reader.next(khead, vhead)) {
                    StringTokenizer itr = new StringTokenizer(vhead.toString());
                    while (itr.hasMoreTokens()) {
                        word.set(itr.nextToken());
                        // send key-value
                        MPI_D.Send(word, one);
                    }
                }
                reader.close();
            } else if (MPI_D.COMM_BIPARTITE_A != null) {
                // A communicator
                int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
                int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_A);
                System.out.println("The A task " + rank + " of " + size + " is working...");

                HadoopWriter<Text, IntWritable> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir, Text.class,
                        IntWritable.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);

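                // Pairs returned by MPI_D.Recv() arrive grouped by key, so one pass
                // can sum consecutive equal keys and write each word's total exactly
                // once, when the key changes.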
                Text oldKey = null;
                IntWritable valueData = new IntWritable();
                int sum = 0;
                Object[] keyValue = MPI_D.Recv();
                while (keyValue != null) {
                    Text key = (Text) keyValue[0];
                    IntWritable value = (IntWritable) keyValue[1];
                    if (oldKey == null) {
                        oldKey = key;
                        sum = value.get();
                    } else {
                        if (key.equals(oldKey)) {
                            sum += value.get();
                        } else {
                            valueData.set(sum);
                            outrw.write(oldKey, valueData);
                            oldKey = key;
                            sum = value.get();
                        }
                    }
                    keyValue = MPI_D.Recv();
                }
                if (oldKey != null) {
                    valueData.set(sum);
                    outrw.write(oldKey, valueData);
                }
                outrw.close();
            }
            MPI_D.Finalize();
        } catch (MPI_D_Exception e) {
            e.printStackTrace();
        }
    }

    private static void initConf(HashMap<String, String> conf) {
        conf.put(MPI_D_Constants.ReservedKeys.KEY_CLASS, Text.class.getName());
        conf.put(MPI_D_Constants.ReservedKeys.VALUE_CLASS, IntWritable.class.getName());
        conf.put(MPI_D_Constants.ReservedKeys.COMBINER_CLASS, WordCountCombiner.class.getName());

        if (maxUsedMemPercent != null) {
            conf.put(MPI_D_Constants.ReservedKeys.CommonModeKeys.MAX_MEM_USED_PERCENT, maxUsedMemPercent);
        }
        if (partSize != null) {
            conf.put(MPI_D_Constants.ReservedKeys.CommonModeKeys.BLOCK_SIZE, partSize);
        }
        if (outFileNum != null) {
            conf.put(MPI_D_Constants.ReservedKeys.CommonModeKeys.SEND_QUEUE_LENGTH, outFileNum);
        }
        if (spillPercent != null) {
            conf.put(MPI_D_Constants.ReservedKeys.CommonModeKeys.SPILL_PERCENT, spillPercent);
        }
    }

    private static void parseArgs(String[] args) {
        if (args.length < 3) {
            System.err.printf(
                    "ERROR: The %s example needs at least three parameters: the Hadoop conf path "
                            + "(e.g. core-site.xml), the input directory, and the output directory.%n",
                    WordCountOnHDFSDataLocal.class.getSimpleName());
            System.exit(-1);
        } else if (args.length == 3) {
            confPath = args[0];
            inDir = args[1];
            outDir = args[2];
        } else if (args.length == 7) {
            // Advanced usage: all four tuning parameters must be supplied.
            confPath = args[0];
            inDir = args[1];
            outDir = args[2];
            maxUsedMemPercent = args[3];
            partSize = args[4];
            outFileNum = args[5];
            spillPercent = args[6];
        } else {
            System.err.println("ERROR: Wrong number of parameters.");
            System.exit(-1);
        }
    }
}
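
For reference, the A-task loop above performs a single-pass streaming aggregation over key/value pairs that arrive grouped by key: it keeps only the current key and its running sum, and writes a (word, total) pair each time the key changes. The following is a minimal standalone sketch of that same pattern over plain Java collections; it has no DataMPI or Hadoop dependency, and the class and variable names are illustrative only.

import java.util.Arrays;
import java.util.List;

public class GroupedSumSketch {
    public static void main(String[] args) {
        // Words already grouped (sorted), each standing for a (word, 1) pair,
        // just as the A task expects the stream from MPI_D.Recv() to be ordered.
        List<String> words = Arrays.asList("apple", "apple", "banana", "cherry", "cherry", "cherry");

        String oldKey = null;
        int sum = 0;
        for (String key : words) {
            int value = 1;                      // every occurrence carries a count of 1
            if (oldKey == null) {               // first pair: open the first group
                oldKey = key;
                sum = value;
            } else if (key.equals(oldKey)) {    // same key: accumulate
                sum += value;
            } else {                            // key changed: emit the finished group
                System.out.println(oldKey + "\t" + sum);
                oldKey = key;
                sum = value;
            }
        }
        if (oldKey != null) {                   // emit the last group
            System.out.println(oldKey + "\t" + sum);
        }
    }
}

Because equal keys are adjacent, only the current key and its running sum have to be held in memory, which is why the A task never needs to buffer a whole group before writing it out.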