org.bigdata.ComplexInvertIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.bigdata.ComplexInvertIndex.java

Source

/*
 *   Copyright 2013-2014 CloudAtlas, Chengdu, China. All rights reserved.
 *
 *   Author: 0xC000005
 *   Email: fle4y@outlook.com
 *   Url: https://github.com/0xC000005
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.bigdata;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.bigdata.util.HadoopConfig;

/**
 * Builds an inverted index with per-document term frequencies using Hadoop
 * MapReduce: each output line maps a word to the list of (document, count)
 * pairs in which it occurs.
 *
 * @author   : 0xC000005
 * @mailto   : fle4y@outlook.com
 * @blog     : http://0xC000005.github.io/
 * @since    : 2015-05-21
 */

public class ComplexInvertIndex {

    /**
     * RecordReader that emits (fileName, line) pairs: line reading is delegated
     * to a {@link LineRecordReader}; the key is the name of the split's file.
     */
    private static class FileNameRecordReader extends RecordReader<Text, Text> {

        private Text fileName;
        private final LineRecordReader lrr = new LineRecordReader();

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            lrr.initialize(split, context);
            // The file name is constant for the whole split, so build the key
            // Text once here instead of allocating a new one per record.
            fileName = new Text(((FileSplit) split).getPath().getName());
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return lrr.nextKeyValue();
        }

        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return fileName;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return lrr.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return lrr.getProgress();
        }

        @Override
        public void close() throws IOException {
            lrr.close();
        }
    }

    /** InputFormat producing (fileName, line) records via FileNameRecordReader. */
    private static class FileNameInputFormat extends FileInputFormat<Text, Text> {
        @Override
        public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Do NOT call initialize() here: the MapReduce framework initializes
            // the reader itself. Initializing twice made the underlying
            // LineRecordReader open its input stream twice (leaking the first).
            return new FileNameRecordReader();
        }
    }

    /**
     * Map step: for every token of every line, emits ("word#fileName", 1).
     * The composite key lets the combiner sum counts per (word, document).
     */
    private static class InvertIndexMapper extends Mapper<Text, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);
        private final Text outKey = new Text();

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split(" ")) {
                // split(" ") yields empty strings for consecutive spaces;
                // skip them so we never emit a junk "#fileName" key.
                if (token.isEmpty()) {
                    continue;
                }
                outKey.set(token + "#" + key.toString());
                context.write(outKey, ONE);
            }
        }
    }

    /** Combiner: locally sums the partial counts for each "word#fileName" key. */
    private static class InvertIndexCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    /**
     * Partitions on the word part of the key only (before '#'), so that every
     * posting of a given word is routed to the same reduce task.
     */
    private static class InvertIndexPartitioner extends HashPartitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numReduceTasks) {
            String word = key.toString().split("#")[0];
            return super.getPartition(new Text(word), value, numReduceTasks);
        }
    }

    /**
     * Reduce step: accumulates the postings list per word in memory and writes
     * "word -> (doc,count) (doc,count) ..." during cleanup().
     */
    private static class InvertIndexReducer extends Reducer<Text, IntWritable, Text, Text> {

        // Instance field, NOT static: a static map would leak state between
        // reducer task instances when the JVM is reused, duplicating output.
        // StringBuilder avoids O(n^2) string concatenation per word; the
        // produced text is identical to repeated '+' concatenation.
        private final Map<String, StringBuilder> outputs = new HashMap<String, StringBuilder>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            String[] parts = key.toString().split("#");
            String word = parts[0];
            String doc = parts[1];
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            StringBuilder postings = outputs.get(word);
            if (postings == null) {
                postings = new StringBuilder();
                outputs.put(word, postings);
            }
            postings.append(" (").append(doc).append(",").append(sum).append(") ");
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            for (Map.Entry<String, StringBuilder> entry : outputs.entrySet()) {
                context.write(new Text(entry.getKey()), new Text(entry.getValue().toString()));
            }
            // Defensive reset in case the framework reuses this instance.
            outputs.clear();
        }
    }

    /**
     * Configures and submits the inverted-index job: reads from /input,
     * writes the final index to /output. Exits 0 on success, 1 on failure.
     */
    public static void main(String[] args) throws Exception {
        Configuration config = HadoopConfig.getConfig();
        // Job name was mojibake ("??") in the original source; use a real name.
        Job job = Job.getInstance(config, "ComplexInvertIndex");
        job.setJarByClass(ComplexInvertIndex.class);
        job.setInputFormatClass(FileNameInputFormat.class);
        job.setMapperClass(InvertIndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setCombinerClass(InvertIndexCombiner.class);
        job.setReducerClass(InvertIndexReducer.class);
        job.setPartitionerClass(InvertIndexPartitioner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/input"));
        FileOutputFormat.setOutputPath(job, new Path("/output/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}