Java tutorial
/*
 * Copyright 2013
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @author Steffen Remus
 */
public class RelationToOneHoleTransformerJob extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new RelationToOneHoleTransformerJob(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), RelationToOneHoleTransformerJob.class);
        conf.setJobName(RelationToOneHoleTransformerJob.class.getSimpleName());
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapperClass(RelationToOneHoleTransformerJob.Map.class);
        // Map-only job: no reducer means the output records stay unsorted.
        conf.setNumReduceTasks(0);
        // conf.setReducerClass(IdentityReducer.class); // to sort or not to sort, that is the question

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
        return 0;
    }

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

        private Text _key = new Text();
        private Text _value = new Text();

        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> collector, Reporter reporter) throws IOException {
            // Input line layout: pairs <tab> path <tab> rest
            String value_as_string = value.toString();
            int first_tab_index = value_as_string.indexOf('\t');
            int second_tab_index = value_as_string.indexOf('\t', first_tab_index + 1);
            String pairs = value_as_string.substring(0, first_tab_index);
            String path = value_as_string.substring(first_tab_index + 1, second_tab_index);
            String rest = value_as_string.substring(second_tab_index); // keeps its leading tab

            // pairs is in the form w1::w2
            // path is in the form @1<=x=>@2
            int pair_divider_index = pairs.indexOf("::");
            String w1 = pairs.substring(0, pair_divider_index);
            String w2 = pairs.substring(pair_divider_index + 2);
            String plain_path = path.substring(2, path.length() - 2); // strip the @1 / @2 placeholders

            // for w1::path::w2 write three jobims:
            // @::path::w2 <tab> w1
            // w1::@::w2   <tab> path
            // w1::path::@ <tab> w2
            _key.set(String.format("@::%s::%s", plain_path, w2));
            _value.set(w1 + rest);
            collector.collect(_key, _value);

            _key.set(String.format("%s::@::%s", w1, w2));
            _value.set(plain_path + rest);
            collector.collect(_key, _value);

            _key.set(String.format("%s::%s::@", w1, plain_path));
            _value.set(w2 + rest);
            collector.collect(_key, _value);
        }
    }
}
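To make the mapper's string handling concrete, here is a minimal, self-contained sketch that replays the same substring logic on one example line, without needing a Hadoop cluster. The concrete values (pie, oven, the path label baked_in, the trailing count 42) are made up for illustration; only the w1::w2 and @1<=x=>@2 formats come from the comments in the job above.

// Standalone illustration of the one-hole transformation; compile and run with plain javac/java.
public class JobimExample {
    public static void main(String[] args) {
        // Hypothetical input record: "pie::oven" <tab> "@1<=baked_in=>@2" <tab> "42"
        String line = "pie::oven\t@1<=baked_in=>@2\t42";

        int firstTab = line.indexOf('\t');
        int secondTab = line.indexOf('\t', firstTab + 1);
        String pairs = line.substring(0, firstTab);               // "pie::oven"
        String path = line.substring(firstTab + 1, secondTab);    // "@1<=baked_in=>@2"
        String rest = line.substring(secondTab);                  // "\t42" (leading tab kept)

        int divider = pairs.indexOf("::");
        String w1 = pairs.substring(0, divider);                  // "pie"
        String w2 = pairs.substring(divider + 2);                 // "oven"
        String plainPath = path.substring(2, path.length() - 2);  // "<=baked_in=>"

        // The three jobim records, printed as key <tab> value, which is
        // exactly what TextOutputFormat writes for the map-only job above.
        System.out.println(String.format("@::%s::%s\t%s%s", plainPath, w2, w1, rest));
        System.out.println(String.format("%s::@::%s\t%s%s", w1, w2, plainPath, rest));
        System.out.println(String.format("%s::%s::@\t%s%s", w1, plainPath, w2, rest));
    }
}

Running this prints:

@::<=baked_in=>::oven	pie	42
pie::@::oven	<=baked_in=>	42
pie::<=baked_in=>::@	oven	42

Each output line replaces one of the three positions (left word, path, right word) with the @ hole and moves the held-out element into the value, matching the three jobims described in the mapper's comments; the trailing rest field is carried through unchanged.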