/*
 * Copyright 2012
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop.pipetests;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ComparisonChain;

import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

/**
 * MapReduce job that runs the JoBim relation extraction pipeline over plain text input and
 * sums the counts of the extracted (jo, bim) pairs.
 *
 * @author Steffen Remus
 */
public class GoogleSyntacticsJob2 extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(GoogleSyntacticsJob2.class);

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new GoogleSyntacticsJob2(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob2.class);
        conf.setJobName(GoogleSyntacticsJob2.class.getSimpleName());

        conf.setMapperClass(GoogleSyntacticsJob2Mapper.class);
        conf.setReducerClass(GoogleSyntacticsJob2Reducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // conf.setMapOutputKeyClass(Text.class);
        // conf.setMapOutputValueClass(NullWritable.class);

        conf.setOutputKeyClass(JoBimFormat.class);
        conf.setOutputValueClass(IntWritable.class);

        args = new GenericOptionsParser(conf, args).getRemainingArgs();
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // delete output path for testing purposes
        // FileSystem.get(conf).delete(new Path(args[1]), true);

        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractor configuration file parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }

        // ship the extractor configuration files to the task JVMs via the distributed cache
        String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
        for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
            DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

        JobClient.runJob(conf);
        return 0;
    }

    public static class GoogleSyntacticsJob2Mapper extends MapReduceBase
            implements Mapper<LongWritable, Text, JoBimFormat, IntWritable> {

        File _tempfile;
        AnalysisEngine _engine;

        @Override
        public void configure(JobConf job) {
            try {
                // resolve the extractor configuration files against their local copies in the distributed cache
                String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
                String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
                for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                    String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                    for (Path p : DistributedCache.getLocalCacheFiles(job))
                        if (p.getName().contains(extractorConfigurationFileName))
                            extractorConfigurationFilesArr[i] = p.toString();
                }

                // the pipeline writes the extracted relations to this temporary file
                _tempfile = File.createTempFile("casconsumer", null);

                AggregateBuilder builder = new AggregateBuilder();
                // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
                builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline
                        .createGoogleSyntacticsRelationEngine(
                                true /* create_tokens */,
                                true /* create_sentences */,
                                true /* create_dependencies */,
                                true /* create_new_relations */,
                                true /* create_dependency_path */,
                                true /* ignore_nn_relations */,
                                5 /* dependecy_path_maxlength */,
                                false /* create_detailed_output */,
                                extractorConfigurationFilesArr /* extractor_configuration */,
                                _tempfile.getAbsolutePath() /* output_destination */)));
                _engine = builder.createAggregate();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        private JoBimFormat _jobim = new JoBimFormat();
        private IntWritable _count = new IntWritable();

        @Override
        public void map(LongWritable key, Text value, OutputCollector<JoBimFormat, IntWritable> output,
                Reporter reporter) throws IOException {
            String casinput = value.toString();

            // run pipeline, get results from tempfile
            JCas aJCas;
            try {
                aJCas = _engine.newJCas();
            } catch (ResourceInitializationException e) {
                LOG.error("Could not initialize cas", e);
                return;
            }
            aJCas.setDocumentText(casinput);
            DocumentMetaData meta = DocumentMetaData.create(aJCas);
            meta.setDocumentId(key.toString());
            try {
                _engine.process(aJCas);
            } catch (AnalysisEngineProcessException e) {
                LOG.error("Could not process cas", e);
                return;
            }

            // each line of the temp file holds a tab-separated triple: jo, bim, count
            LineIterator iter = new LineIterator(new FileReader(_tempfile));
            while (iter.hasNext()) {
                String line = iter.nextLine();
                String[] splits = line.split("\t");
                _jobim._jo = splits[0];
                _jobim._bim = splits[1];
                _count.set(Integer.parseInt(splits[2]));
                output.collect(_jobim, _count);
            }
            iter.close();

            // remove the temp file so results from this record are not read again for the next one
            try {
                _tempfile.delete();
            } catch (Exception e) {
                LOG.error("Could not delete tempfile.", e);
            }
        }
    }

    public static class GoogleSyntacticsJob2Reducer extends MapReduceBase
            implements Reducer<JoBimFormat, IntWritable, JoBimFormat, IntWritable> {

        private IntWritable _count = new IntWritable();

        @Override
        public void reduce(JoBimFormat key, Iterator<IntWritable> values,
                OutputCollector<JoBimFormat, IntWritable> output, Reporter reporter) throws IOException {
            // sum the counts emitted for each (jo, bim) key
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            _count.set(sum);
            output.collect(key, _count);
        }
    }

    /**
     * Writable key holding a (jo, bim) pair; serialized as two UTF strings and rendered
     * tab-separated by {@link #toString()}.
     */
    public static class JoBimFormat implements WritableComparable<JoBimFormat> {

        String _jo;
        String _bim;

        @Override
        public void readFields(DataInput in) throws IOException {
            _jo = in.readUTF();
            _bim = in.readUTF();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(_jo);
            out.writeUTF(_bim);
        }

        @Override
        public int compareTo(JoBimFormat o) {
            return ComparisonChain.start().compare(_jo, o._jo).compare(_bim, o._bim).result();
        }

        @Override
        public boolean equals(Object obj) {
            if (obj instanceof JoBimFormat)
                return compareTo((JoBimFormat) obj) == 0;
            return false;
        }

        @Override
        public int hashCode() {
            return _jo.hashCode() ^ _bim.hashCode();
        }

        @Override
        public String toString() {
            return String.format("%s\t%s", _jo, _bim);
        }
    }
}
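// Usage sketch: the jar name and paths below are illustrative, not taken from the project.
// The job is launched through ToolRunner; after Hadoop's generic options are parsed, the first
// remaining argument is taken as the input path and the second as the output path:
//
//   hadoop jar lt-n2n-jobs.jar de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob2 <input-path> <output-path>
//
// With TextOutputFormat, each output line is the key's toString() followed by a tab and the
// summed count, i.e. "<jo>\t<bim>\t<count>".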