jobimtext.thesaurus.distributional.hadoop.mapreduce.CleanContext.java Source code

Introduction

Here is the source code for jobimtext.thesaurus.distributional.hadoop.mapreduce.CleanContext.java, a Hadoop MapReduce job that removes stray whitespace from the tokens of context files and re-aggregates the counts of the cleaned entries.

Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package jobimtext.thesaurus.distributional.hadoop.mapreduce;

import java.io.IOException;

import jobimtext.thesaurus.distributional.hadoop.SumReducer.IntSumReducer;
import jobimtext.thesaurus.distributional.hadoop.util.HadoopUtil;
import jobimtext.thesaurus.distributional.hadoop.util.StringUtil;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * This MapReduce job trims whitespace from each word in the value. It is
 * applicable to context input files in which each line has the format:
 * 
 * string \t string \t ... \t int
 * 
 * i.e. tab-separated word#tag tokens followed by an integer count. The
 * mapper emits the cleaned tokens as the key and the count as the value.
 * 
 * @author Martin Riedl
 */
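/*
 * Worked example (invented data): an input line such as
 *
 *   " mouse #NN\tcat#NN \t7"
 *
 * is cleaned by the mapper below to the key "mouse#NN\tcat#NN" with the
 * value 7; the IntSumReducer combiner/reducer then aggregates the counts
 * of identical cleaned keys.
 */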
public class CleanContext {

    /**
     * The mapper splits each input line into tab-separated tokens, trims
     * whitespace from the word part of every word#tag token, and emits the
     * cleaned line as the key together with the trailing count as the value.
     * Lines containing a token whose word is empty after trimming are
     * dropped entirely.
     */
    @SuppressWarnings("deprecation")
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {

            // split the line into tab-separated tokens; all tokens but the
            // last are word#tag entries, the last token is an integer count
            String[] line = value.toString().split("\t");
            int linesize = line.length;
            StringBuilder newLine = new StringBuilder();
            for (int i = 0; i < linesize - 1; i++) {

                String l = line[i];
                String ln = StringUtil.trim(l);
                // split on the last '#' into word and tag, trimming the word;
                // the index must be taken from the trimmed string ln, not l
                int li = ln.lastIndexOf("#");
                String w = ln.substring(0, li);
                String p = ln.substring(li + 1);
                String wt = StringUtil.trim(w);
                if (wt.length() == 0) {
                    // skip the whole record if the trimmed word is empty
                    System.out.println("Neglected: ^" + l + "$" + "\t" + "^" + ln + "$" + "\t" + value.toString());
                    return;
                }

                newLine.append(wt).append("#").append(p).append("\t");
            }
            // emit the cleaned tokens (without the trailing tab) as the key
            // and the trailing count as the value
            output.collect(new Text(newLine.substring(0, newLine.length() - 1)),
                    new IntWritable(Integer.parseInt(line[linesize - 1])));

        } // map()

    } // class map

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
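        /*
         * args[0] and args[1] are used below as the job's input and output
         * paths; HadoopUtil.generateJobConf() builds the base JobConf from
         * the command-line arguments.
         */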
        JobConf conf = HadoopUtil.generateJobConf(args);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);

        conf.setCombinerClass(IntSumReducer.class);
        conf.setReducerClass(IntSumReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        /* number of milliseconds before a non-responding task is killed */
        conf.set("mapred.task.timeout", "600000");

        /* set the HDFS block size to 128 MB */
        conf.set("dfs.block.size", "134217728");

        /* set the maximum number of map tasks per node */
        int maptasks = 100;

        /*
         * Number of map tasks to deploy on each machine. 0.5 to 2 *
         * (cores/node)
         */
        conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
        conf.set("mapred.tasktracker.map", "" + maptasks);
        /*
         * The default number of map tasks per job. Typically set to a prime
         * several times greater than the number of available hosts.
         */
        conf.set("mapred.map.tasks", "" + maptasks);

        int reducetasks = 120;

        conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
        conf.set("mapred.tasktracker.reduce", "" + reducetasks);
        conf.set("mapred.reduce.tasks", "" + reducetasks);
        conf.set("mapred.job.map.memory.mb", "3000");
        conf.set("mapred.job.reduce.memory.mb", "3000");
        JobClient.runJob(conf);

    } // main

} // class CleanContext
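
Example

The cleaning step in Map.map() can be tried outside Hadoop. The following self-contained sketch replicates it on one invented input line; it substitutes String.trim() for the project's StringUtil.trim(), whose exact behavior is not shown here, so treat it as an illustration rather than the exact implementation.

public class CleanContextDemo {

    public static void main(String[] args) {
        // one invented input line: two word#tag tokens and a trailing count
        String value = " mouse #NN\t cat#NN \t7";

        String[] line = value.split("\t");
        StringBuilder newLine = new StringBuilder();
        for (int i = 0; i < line.length - 1; i++) {
            // assumption: StringUtil.trim() behaves like String.trim()
            String ln = line[i].trim();
            int li = ln.lastIndexOf('#');
            String w = ln.substring(0, li).trim();
            if (w.isEmpty()) {
                // the real mapper drops the whole record in this case
                System.out.println("Neglected: " + line[i]);
                return;
            }
            newLine.append(w).append('#').append(ln.substring(li + 1)).append('\t');
        }
        String key = newLine.substring(0, newLine.length() - 1);
        int count = Integer.parseInt(line[line.length - 1]);

        // prints the emitted pair: mouse#NN<tab>cat#NN -> 7
        System.out.println(key + " -> " + count);
    }
}

The job itself is started through main() above, which reads the input path from args[0] and writes the cleaned, re-aggregated contexts to args[1].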