com.datasalt.pangool.benchmark.wordcount.PangoolWordCount.java Source code

Introduction

Here is the source code for com.datasalt.pangool.benchmark.wordcount.PangoolWordCount.java.
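The class implements the classic word count example on top of Pangool's Tuple MapReduce API: a TupleMapper splits each input line into (word, 1) tuples, a combiner and a reducer sum the counts per word, and the result is written out as (word, count) pairs.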

Source

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.benchmark.wordcount;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.TupleReducer;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.HadoopOutputFormat;

/**
 * Code for solving the classic word count problem with Pangool.
 */
public class PangoolWordCount {

    public final static Charset UTF8 = Charset.forName("UTF-8");

    @SuppressWarnings("serial")
    public static class Split extends TupleMapper<LongWritable, Text> {

        private Tuple tuple;

        @Override
        public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException {
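            // Create the reusable output tuple lazily, once the intermediate schema
            // is available from the job context.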
            if (tuple == null) {
                tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema(0));
            }

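            // Emit one (word, 1) tuple per token; the count field is set once and the
            // same tuple instance is reused for every word.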
            StringTokenizer itr = new StringTokenizer(value.toString());
            tuple.set(1, 1);
            while (itr.hasMoreTokens()) {
                tuple.set(0, itr.nextToken());
                collector.write(tuple);
            }
        }
    }

    @SuppressWarnings("serial")
    public static class CountCombiner extends TupleReducer<ITuple, NullWritable> {

        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
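            // Sum the partial counts for this word, reusing the last tuple seen as the
            // combiner output.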
            int count = 0;
            ITuple outputTuple = null;
            for (ITuple tuple : tuples) {
                outputTuple = tuple;
                count += (Integer) tuple.get(1);
            }
            outputTuple.set(1, count);
            collector.write(outputTuple, NullWritable.get());
        }
    }

    @SuppressWarnings("serial")
    public static class Count extends TupleReducer<Text, IntWritable> {
        private IntWritable outputCount;

        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {

            if (outputCount == null) {
                outputCount = new IntWritable();
            }
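            // Sum all counts for the grouped word and emit a (word, total) pair.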
            int count = 0;
            for (ITuple tuple : tuples) {
                count += (Integer) tuple.get(1);
            }
            outputCount.set(count);
            collector.write((Text) group.get(0), outputCount);
        }
    }

    public Job getJob(Configuration conf, String input, String output) throws TupleMRException, IOException {
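        // Delete any previous output so the job can be re-run over the same path.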
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(output), true);

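        // Intermediate schema: one field for the word and one for its (partial) count.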
        List<Field> fields = new ArrayList<Field>();
        fields.add(Field.create("word", Type.STRING));
        fields.add(Field.create("count", Type.INT));
        Schema schema = new Schema("schema", fields);

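        // Group by "word" so each reduce call receives every tuple for a single word.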
        TupleMRBuilder cg = new TupleMRBuilder(conf, "Pangool WordCount");
        cg.addIntermediateSchema(schema);
        cg.setGroupByFields("word");
        cg.setJarByClass(PangoolWordCount.class);
        cg.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new Split());
        cg.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, Text.class);
        cg.setTupleReducer(new Count());
        cg.setTupleCombiner(new CountCombiner());

        return cg.createJob();
    }

    private static final String HELP = "Usage: PangoolWordCount [input_path] [output_path]";

    public static void main(String[] args)
            throws TupleMRException, IOException, InterruptedException, ClassNotFoundException {
        if (args.length != 2) {
            System.err.println("Wrong number of arguments");
            System.err.println(HELP);
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        new PangoolWordCount().getJob(conf, args[0], args[1]).waitForCompletion(true);
    }
}