de.tudarmstadt.lt.nlkg.ConvertSVO.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.nlkg.ConvertSVO.java

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.nlkg;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Hadoop job (old {@code org.apache.hadoop.mapred} API) that converts
 * tab-separated subject/verb/object lines into keyed context records.
 *
 * <p>Each input line is expected to hold three columns (subject, verb, object)
 * and optionally a fourth integer count column. For every accepted line the
 * mapper emits three records, one keyed by each of the three elements, where
 * the value is the triple with the key element replaced by {@code "@"}.
 *
 * <p>Usage: {@code ConvertSVO [generic options] <input path> <output path>}
 *
 * @author Steffen Remus
 **/
public class ConvertSVO extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(ConvertSVO.class);

    /** Exit code returned by {@link #run(String[])} when required arguments are missing. */
    private static final int EXIT_USAGE = 2;

    /**
     * Command line entry point; delegates to {@link ToolRunner} and exits with
     * the code returned by {@link #run(String[])}.
     *
     * @param args command line arguments (generic Hadoop options followed by
     *             the input path and the output path)
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new ConvertSVO(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the conversion job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output
     *             path; additional arguments are ignored
     * @return {@code 0} when the job was submitted and finished without
     *         {@link JobClient#runJob(JobConf)} throwing, {@code 2} when fewer
     *         than two arguments were supplied
     * @throws Exception if the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.printf("Usage: %s [generic options] <input path> <output path>%n",
                    ConvertSVO.class.getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return EXIT_USAGE;
        }

        JobConf conf = new JobConf(getConf(), ConvertSVO.class);
        conf.setJobName(ConvertSVO.class.getSimpleName());

        conf.setMapperClass(ConversionMapper.class);
        // Combiner and reducer both pass records through unchanged.
        conf.setCombinerClass(IdentityReducer.class);
        conf.setReducerClass(IdentityReducer.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(ConvertedWritable.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
        return 0;
    }

    /**
     * Mapper that splits a line into subject, verb, object (and optional
     * count) and emits one record per element.
     */
    public static class ConversionMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, ConvertedWritable> {

        /** Marker that replaces the key element inside the emitted value. */
        private static final String PLACEHOLDER = "@";

        /** Separator between the three parts of the emitted value. */
        private static final String SEPARATOR = "::";

        // Reused between map() calls to avoid per-record allocation. These are
        // instance fields (not static) so that several mapper instances living
        // in the same JVM never share and overwrite each other's state.
        private final Text outKey = new Text();
        private final ConvertedWritable outValue = new ConvertedWritable();

        /**
         * Converts one input line into three output records.
         *
         * <p>Lines with fewer than three columns, or with exactly four columns
         * whose fourth column is not an integer, are logged and skipped.
         *
         * @param key      value passed in by the input format; only used in log messages
         * @param value    the raw input line
         * @param output   collector receiving the three emitted records
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, ConvertedWritable> output,
                Reporter reporter) throws IOException {
            String line = value.toString().trim();
            String[] splits = line.split("\t");
            if (splits.length < 3) {
                LOG.warn("Unexpected format (too few tab characters): '{}', line {}", line, key.get());
                return;
            }
            if (splits.length > 4) {
                // The line is still processed: extra columns are ignored and,
                // because the count is only read for exactly four columns,
                // the count falls back to 1.
                LOG.warn("Unexpected format (too many tab characters): '{}', line {}", line, key.get());
            }

            String subj = normalize(splits[0]);
            String verb = normalize(splits[1]);
            String obj = normalize(splits[2]);

            int count = 1;
            if (splits.length == 4) {
                try {
                    count = Integer.parseInt(splits[3].trim());
                } catch (NumberFormatException e) {
                    LOG.error("Unexpected format (unable to parse int in col 4): '{}', line {}", line, key.get());
                    return;
                }
            }

            emit(output, subj, context(PLACEHOLDER, verb, obj), count);
            emit(output, obj, context(subj, verb, PLACEHOLDER), count);
            emit(output, verb, context(subj, PLACEHOLDER, obj), count);
        }

        /** Trims a column and replaces inner spaces with underscores. */
        private static String normalize(String column) {
            return column.trim().replace(' ', '_');
        }

        /** Joins the three parts into the {@code a::b::c} value representation. */
        private static String context(String first, String second, String third) {
            return first + SEPARATOR + second + SEPARATOR + third;
        }

        /** Fills the reusable key/value holders and hands them to the collector. */
        private void emit(OutputCollector<Text, ConvertedWritable> output, String keyText, String context,
                int count) throws IOException {
            outKey.set(keyText);
            outValue.set(context, count);
            output.collect(outKey, outValue);
        }
    }
}