de.tudarmstadt.lt.n2n.hadoop.RelationToOneHoleTransformerJob.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.hadoop.RelationToOneHoleTransformerJob.java, a map-only Hadoop job that reads tab-separated records (word pair, path, remaining fields) and rewrites each record into three "one-hole" jobim pairs, with one of the two words or the connecting path left open as the hole.

Source

/*
 *   Copyright 2013
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 *
 * @author Steffen Remus
 */
public class RelationToOneHoleTransformerJob extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new RelationToOneHoleTransformerJob(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), RelationToOneHoleTransformerJob.class);

        conf.setJobName(RelationToOneHoleTransformerJob.class.getSimpleName());
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setMapperClass(RelationToOneHoleTransformerJob.Map.class);
        conf.setNumReduceTasks(0); // map-only job: mapper output is written directly by the output format
        // conf.setReducerClass(IdentityReducer.class); // sort or no sort? that is the question here

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);

        return 0;
    }

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
        private Text _key = new Text();
        private Text _value = new Text();

        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> collector, Reporter reporter)
                throws IOException {
            // split the line at the first two tab characters: <pair> TAB <path> TAB <rest>
            String value_as_string = value.toString();
            int first_tab_index = value_as_string.indexOf('\t');
            int second_tab_index = value_as_string.indexOf('\t', first_tab_index + 1);
            String pairs = value_as_string.substring(0, first_tab_index);
            String path = value_as_string.substring(first_tab_index + 1, second_tab_index);
            String rest = value_as_string.substring(second_tab_index); // keeps its leading tab
            // pairs is in the form w1::w2
            // path is in the form @1<=x=>@2
            int pair_divider_index = pairs.indexOf("::");
            String w1 = pairs.substring(0, pair_divider_index);
            String w2 = pairs.substring(pair_divider_index + 2);
            String plain_path = path.substring(2, path.length() - 2); // strip the leading "@1" and trailing "@2" placeholders

            // for w1::path::w2 write three jobims:
            // @::path::w2 <tab> w1
            // w1::@::w2 <tab> path
            // w1::path::@ <tab> w2

            _key.set(String.format("@::%s::%s", plain_path, w2));
            _value.set(w1 + rest);
            collector.collect(_key, _value);

            _key.set(String.format("%s::@::%s", w1, w2));
            _value.set(plain_path + rest);
            collector.collect(_key, _value);

            _key.set(String.format("%s::%s::@", w1, plain_path));
            _value.set(w2 + rest);
            collector.collect(_key, _value);
        }
    }

}
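
The transformation the mapper performs can be seen in isolation with a minimal standalone sketch; the class name JobimSketch and the sample input line (including the trailing count field) are illustrative assumptions, not part of the original job:

public class JobimSketch {
    public static void main(String[] args) {
        // Hypothetical input record: "<w1>::<w2>" TAB "@1<path>@2" TAB "<count>"
        String line = "cat::dog\t@1<=chases=>@2\t42";

        int firstTab = line.indexOf('\t');
        int secondTab = line.indexOf('\t', firstTab + 1);
        String pairs = line.substring(0, firstTab);            // "cat::dog"
        String path = line.substring(firstTab + 1, secondTab); // "@1<=chases=>@2"
        String rest = line.substring(secondTab);               // "\t42", keeps the tab

        int divider = pairs.indexOf("::");
        String w1 = pairs.substring(0, divider);                 // "cat"
        String w2 = pairs.substring(divider + 2);                // "dog"
        String plainPath = path.substring(2, path.length() - 2); // "<=chases=>"

        // The three one-hole records, joined with a tab as TextOutputFormat would write them:
        System.out.println(String.format("@::%s::%s", plainPath, w2) + "\t" + w1 + rest);
        System.out.println(String.format("%s::@::%s", w1, w2) + "\t" + plainPath + rest);
        System.out.println(String.format("%s::%s::@", w1, plainPath) + "\t" + w2 + rest);
    }
}

On a cluster, the job itself would typically be launched with hadoop jar, passing the input and output directories as the two positional arguments that run() reads from args[0] and args[1]; the exact jar name depends on how the project is packaged.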