de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJob.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJob.java
Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import jobimtext.holing.extractor.JobimAnnotationExtractor;
import jobimtext.holing.extractor.JobimExtractorConfiguration;
import jobimtext.holing.type.JoBim;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ComparisonChain;

import de.tudarmstadt.lt.n2n.annotators.JoBimPrinter;
import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.lt.utilities.types.RepeatedSentence;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

/**
 * 
 * @author Steffen Remus
 */
public class GoogleSyntacticsJob extends Configured implements Tool {

    private static Logger LOG = LoggerFactory.getLogger(GoogleSyntacticsJob.class);

    /**
     * Command line entry point. Hands a fresh {@link GoogleSyntacticsJob} together with the raw command
     * line arguments to {@link ToolRunner} and terminates the JVM with the exit code it reports.
     *
     * @param args
     *            the command line arguments, forwarded unchanged to {@link ToolRunner}
     * @throws Exception
     *             whatever {@link ToolRunner#run(Tool, String[])} throws
     */
    public static void main(String[] args) throws Exception {
        final GoogleSyntacticsJob job = new GoogleSyntacticsJob();
        final int exitCode = ToolRunner.run(job, args);
        System.exit(exitCode);
    }

    /**
     * Configures the MapReduce job, submits it through {@link JobClient#runJob(JobConf)} and returns once
     * that call has returned.
     * <p>
     * The job reads text input from {@code args[0]} and writes text output to {@code args[1]}. If the
     * extractor configuration parameter ({@link SHARED_CONSTANTS#PARAM_EXTRACTORCONFIGS}) is not set, the
     * default extractor configurations are joined with commas, reported on standard output and stored in
     * the job configuration. Every listed configuration file is added to the task class path via the
     * distributed cache.
     *
     * @param args
     *            the remaining command line arguments: input path followed by output path
     * @return {@code 0} after {@link JobClient#runJob(JobConf)} has returned, {@code 1} if the input or
     *         output path is missing
     * @throws Exception
     *             if configuring or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException when a path is missing.
        if (args == null || args.length < 2) {
            System.err.format("Usage: %s [generic options] <input path> <output path>%n",
                    GoogleSyntacticsJob.class.getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return 1;
        }

        JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob.class);
        conf.setJobName(GoogleSyntacticsJob.class.getSimpleName());

        conf.setMapperClass(GoogleSyntacticsJob3Mapper.class);
        // The reducer only sums counts and has identical input and output types, so it can also
        // pre-aggregate the map output as a combiner.
        conf.setCombinerClass(GoogleSyntacticsJob3Reducer.class);
        conf.setReducerClass(GoogleSyntacticsJob3Reducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setOutputKeyClass(JoBimFormat.class);
        conf.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // delete output path for testing purposes
        // FileSystem.get(conf).delete(new Path(args[1]), true);

        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            // Store the default so that the mapper reads the same list in configure().
            conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }

        for (String extractorConfigurationFile : extractorConfigurationFiles.split(",")) {
            DistributedCache.addFileToClassPath(new Path(extractorConfigurationFile), conf);
        }

        JobClient.runJob(conf);
        return 0;
    }

    /**
     * Mapper that runs every input line through the Google syntactics relation engine and emits, for each
     * {@link JoBim} found in a {@link RepeatedSentence} and for each configured extractor, the extracted
     * (key, values) pair together with the sentence's repetition count plus one.
     */
    public static class GoogleSyntacticsJob3Mapper extends MapReduceBase
            implements Mapper<LongWritable, Text, JoBimFormat, IntWritable> {

        AnalysisEngine _engine;
        JobimAnnotationExtractor[] _extractors;

        /** CAS shared by all map calls; created on first use and reset before every further record. */
        private JCas _jcas;

        /** Output key instance that is refilled for every emitted pair. */
        private final JoBimFormat _jobim = new JoBimFormat();
        /** Output value instance that is refilled for every emitted pair. */
        private final IntWritable _count = new IntWritable();

        /**
         * Builds the analysis engine and the extractors from the job configuration.
         *
         * @param job
         *            the job configuration; must contain {@link SHARED_CONSTANTS#PARAM_EXTRACTORCONFIGS}
         * @throws IllegalStateException
         *             if the extractor configuration parameter is not set
         * @throws RuntimeException
         *             wrapping any exception raised while creating the engine or the extractors
         */
        @Override
        public void configure(JobConf job) {
            String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
            if (extractorConfigurationFiles == null) {
                throw new IllegalStateException(
                        "Job parameter '" + SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS + "' is not set.");
            }
            try {
                String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");

                // The list of localized cache files is the same for every configuration file, so it is
                // fetched once. It may be null when nothing has been localized; in that case the
                // configured names are kept unchanged.
                Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(job);
                if (localCacheFiles != null) {
                    for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                        String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                        for (Path p : localCacheFiles) {
                            if (p.getName().contains(extractorConfigurationFileName)) {
                                extractorConfigurationFilesArr[i] = p.toString();
                            }
                        }
                    }
                }

                int maxlength = job.getInt(SHARED_CONSTANTS.PARAM_MAXIMUM_PATHLENGTH, -1);

                AggregateBuilder builder = new AggregateBuilder();
                // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
                builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline
                        .createGoogleSyntacticsRelationEngine(true/* create_tokens */, true/* create_sentences */,
                                true/* create_dependencies */, true/* create_new_relations */,
                                true/* create_dependency_path */, false/*ignore_nn_relations*/,
                                maxlength/* dependecy_path_maxlength (-1 for dynamic pathlength based on arc type, e.g. biarc, triarc, quadarc) */,
                                false/* create_detailed_output */, null/* extractor_configuration */,
                                null/* output_destination */)));

                _engine = builder.createAggregate();

                try {
                    _extractors = new JobimAnnotationExtractor[extractorConfigurationFilesArr.length];
                    for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                        // Only the bare file name is handed to the extractor factory.
                        // NOTE(review): presumably it is resolved via the class path the files were added
                        // to at job submission - confirm.
                        _extractors[i] = JobimExtractorConfiguration
                                .getExtractorFromXmlFile(new File(extractorConfigurationFilesArr[i]).getName());
                    }
                } catch (Exception e) {
                    throw new ResourceInitializationException(e);
                }

            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * Processes one input line with the analysis engine and emits the extracted pairs.
         * <p>
         * If the CAS cannot be created or the engine fails on the line, the error is logged and the line
         * is skipped without emitting anything.
         *
         * @param key
         *            the key supplied by the input format; used together with the file name as document id
         * @param value
         *            the line to analyze
         * @param output
         *            receives one (key, count) pair per JoBim and extractor
         * @param reporter
         *            used to report progress and to look up the current input split
         * @throws IOException
         *             if collecting the output fails
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<JoBimFormat, IntWritable> output,
                Reporter reporter) throws IOException {

            reporter.progress();

            FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
            String filename = fileSplit.getPath().getName();
            String id = filename + ":" + key;

            String casinput = value.toString();

            JCas aJCas = acquireJCas();
            if (aJCas == null) {
                return;
            }
            aJCas.setDocumentText(casinput);
            DocumentMetaData meta = DocumentMetaData.create(aJCas);
            meta.setDocumentId(id);
            meta.setCollectionId(filename);

            try {
                _engine.process(aJCas);
            } catch (AnalysisEngineProcessException e) {
                LOG.error("Could not process cas", e);
                // Do not reuse a CAS that was left behind by a failed run; the next record gets a new one.
                _jcas = null;
                return;
            }

            reporter.progress();

            Collection<RepeatedSentence> covering_annotations = JCasUtil.select(aJCas, RepeatedSentence.class);
            for (RepeatedSentence covering_annotation : covering_annotations) {
                int repetitions = covering_annotation.getRepetitionCount();
                for (JoBim jb : JoBimPrinter.getJoBims(covering_annotation, false)) {
                    for (JobimAnnotationExtractor extractor : _extractors) {
                        _jobim._jo = extractor.extractKey(jb);
                        _jobim._bim = extractor.extractValues(jb);
                        _count.set(repetitions + 1);
                        output.collect(_jobim, _count);
                    }
                }
            }
        }

        /**
         * Destroys the analysis engine when the task is done. Failures are only logged so that they cannot
         * fail a task whose output has already been collected.
         */
        @Override
        public void close() throws IOException {
            _jcas = null;
            if (_engine != null) {
                try {
                    _engine.destroy();
                } catch (RuntimeException e) {
                    LOG.warn("Could not destroy analysis engine", e);
                }
                _engine = null;
            }
        }

        /**
         * Returns an empty CAS for the next record: the shared instance after a reset, or a newly created
         * one if none exists yet. Creating a CAS per record is avoided because only its content changes
         * between records.
         *
         * @return the CAS to fill, or {@code null} if it could not be created (the error is logged)
         */
        private JCas acquireJCas() {
            if (_jcas != null) {
                _jcas.reset();
                return _jcas;
            }
            try {
                _jcas = _engine.newJCas();
            } catch (ResourceInitializationException e) {
                LOG.error("Could not initialize cas", e);
            }
            return _jcas;
        }
    }

    /**
     * Reducer that adds up all counts collected for one {@link JoBimFormat} key and emits the key with the
     * total.
     */
    public static class GoogleSyntacticsJob3Reducer extends MapReduceBase
            implements Reducer<JoBimFormat, IntWritable, JoBimFormat, IntWritable> {

        /** Output value instance that is refilled for every key. */
        private final IntWritable _count = new IntWritable();

        /**
         * Sums the values of one key and emits the total.
         * <p>
         * The sum is accumulated as {@code long}. A total outside the {@code int} range is clamped to
         * {@link Integer#MAX_VALUE} (or {@link Integer#MIN_VALUE}) instead of wrapping around; each such
         * key is logged and counted in the job counter {@code CLAMPED_COUNTS}.
         *
         * @param key
         *            the key all values belong to
         * @param values
         *            the counts to add up
         * @param output
         *            receives the key together with its total
         * @param reporter
         *            used to count clamped totals
         * @throws IOException
         *             if collecting the output fails
         */
        @Override
        public void reduce(JoBimFormat key, Iterator<IntWritable> values,
                OutputCollector<JoBimFormat, IntWritable> output, Reporter reporter) throws IOException {
            long sum = 0L;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            if (sum > Integer.MAX_VALUE || sum < Integer.MIN_VALUE) {
                int clamped = sum > 0 ? Integer.MAX_VALUE : Integer.MIN_VALUE;
                LOG.warn("Count {} for key '{}' does not fit into an int, clamping to {}",
                        new Object[] { sum, key, clamped });
                reporter.incrCounter(GoogleSyntacticsJob3Reducer.class.getSimpleName(), "CLAMPED_COUNTS", 1L);
                sum = clamped;
            }
            _count.set((int) sum);
            output.collect(key, _count);
        }

    }

    /**
     * MapReduce key made of two strings, {@code _jo} and {@code _bim}. Instances are ordered by
     * {@code _jo} first and {@code _bim} second and are printed as the two strings separated by a tab.
     */
    public static class JoBimFormat implements WritableComparable<JoBimFormat> {

        String _jo;
        String _bim;

        /**
         * Replaces both fields with the values read from {@code in}, in the order written by
         * {@link #write(DataOutput)}.
         */
        @Override
        public void readFields(DataInput in) throws IOException {
            _jo = in.readUTF();
            _bim = in.readUTF();
        }

        /**
         * Writes {@code _jo} followed by {@code _bim} using {@link DataOutput#writeUTF(String)}. Both
         * fields have to be set before the key is written.
         */
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(_jo);
            out.writeUTF(_bim);
        }

        /**
         * Compares by {@code _jo} and, if those are equal, by {@code _bim}.
         */
        @Override
        public int compareTo(JoBimFormat o) {
            return ComparisonChain.start().compare(_jo, o._jo).compare(_bim, o._bim).result();
        }

        /**
         * Two keys are equal if both of their fields are equal. Unset ({@code null}) fields are only equal
         * to unset fields, so comparing a freshly constructed key does not fail.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof JoBimFormat)) {
                return false;
            }
            JoBimFormat other = (JoBimFormat) obj;
            return sameString(_jo, other._jo) && sameString(_bim, other._bim);
        }

        /**
         * Combines the hash codes of both fields in an order-sensitive way. A plain XOR would give
         * (a, b) and (b, a) the same hash and would hash every key with identical fields to 0, which
         * concentrates such keys wherever the hash decides the placement.
         */
        @Override
        public int hashCode() {
            int result = (_jo == null) ? 0 : _jo.hashCode();
            return 31 * result + ((_bim == null) ? 0 : _bim.hashCode());
        }

        @Override
        public String toString() {
            return String.format("%s\t%s", _jo, _bim);
        }

        /** Null-safe string equality. */
        private static boolean sameString(String a, String b) {
            return (a == null) ? (b == null) : a.equals(b);
        }

    }

}