Java tutorial
/* * Copyright 2012 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.lt.nlkg; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.lib.IdentityReducer; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author Steffen Remus **/ public class ConvertInvertSVO extends Configured implements Tool { private final static Logger LOG = LoggerFactory.getLogger(ConvertInvertSVO.class); public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new ConvertInvertSVO(), args); System.exit(res); } @Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), ConvertInvertSVO.class); conf.setJobName(ConvertInvertSVO.class.getSimpleName()); conf.setMapperClass(ConversionMapper.class); conf.setCombinerClass(IdentityReducer.class); conf.setReducerClass(IdentityReducer.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(ConvertedWritable.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); return 0; } public static class ConversionMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, ConvertedWritable> { private static Text jo = new Text(); private static ConvertedWritable bim = new ConvertedWritable(); @Override public void map(LongWritable key, Text value, OutputCollector<Text, ConvertedWritable> output, Reporter reporter) throws IOException { String line = value.toString().trim(); String[] splits = line.split("\t"); if (splits.length < 3) { LOG.warn("Unexpected format (too few tab characters): '{}', line {}", line, key.get()); return; } if (splits.length > 4) { LOG.warn("Unexpected format (too many tab characters): '{}', line {}", line, key.get()); // TODO: log info unexpected format } String subj = splits[0].trim().replace(' ', '_'); String verb = splits[1].trim().replace(' ', '_'); String obj = splits[2].trim().replace(' ', '_'); int count = 1; if (splits.length == 4) { try { count = Integer.parseInt(splits[3].trim()); } catch (NumberFormatException e) { LOG.error("Unexpected format (unable to parse int in col 4): '{}', line {}", line, key.get()); return; } } // for every svo triple write a jobim pair with the hole in S position, O position, V position jo.set(verb); bim.set(String.format("%s::%s::%s", subj, "@", obj), count); output.collect(jo, bim); jo.set(subj); bim.set(String.format("%s::%s::%s", "@", verb, obj), count); output.collect(jo, bim); jo.set(obj); bim.set(String.format("%s::%s::%s", subj, verb, "@"), count); output.collect(jo, bim); // invert svo to ov^-1s triples and do the same verb = verb + "^-1"; jo.set(verb); bim.set(String.format("%s::%s::%s", obj, "@", subj), count); output.collect(jo, bim); jo.set(obj); bim.set(String.format("%s::%s::%s", "@", verb, subj), count); output.collect(jo, bim); jo.set(subj); bim.set(String.format("%s::%s::%s", obj, verb, "@"), count); output.collect(jo, bim); } } }