de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd.java, a Hadoop job driver (built on DKPro BigData's DkproHadoopDriver) that runs the JoBimRelationPipeline as a UIMA analysis engine in the map phase to extract syntactic (JoBim) relations from text.

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop;

import java.io.File;
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat.DocumentTextExtractor;

/**
 * 
 * @author Steffen Remus
 */
public class GoogleSyntacticsJobDkbd extends DkproHadoopDriver {

    static void print_usage(String message) {
        if (message != null && !"".equals(message))
            System.out.println(message);
        // Usage: HadoopPipe [hadoop-params] input output [num-mappers]

        System.out.format(
                "Usage: ... %s -D%s=<extractor-configuration-file1>,<extractor-configuration-file2>,... input output [num-mappers]  %n",
                GoogleSyntacticsJobDkbd.class.getName(), SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    }

    public static void main(String[] args) {
        try {
            ToolRunner.run(new Configuration(), new GoogleSyntacticsJobDkbd(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
        try {
            String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
            String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
            // Replace each configured extractor file with its localized copy from the
            // distributed cache (the files were added to the cache in configure()).
            Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(conf);
            if (localCacheFiles != null) { // null when no cache is available, e.g. when running locally
                for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                    String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                    for (Path p : localCacheFiles)
                        if (p.getName().contains(extractorConfigurationFileName))
                            extractorConfigurationFilesArr[i] = p.toString();
                }
            }

            int maxlength = conf.getInt(SHARED_CONSTANTS.PARAM_MAXIMUM_PATHLENGTH, -1);

            AggregateBuilder builder = new AggregateBuilder();
            // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
            builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline
                    .createGoogleSyntacticsRelationEngine(true/* create_tokens */, true/* create_sentences */,
                            true/* create_dependencies */, true/* create_new_relations */,
                            true/* create_dependency_path */, false/* ignore_nn_relations */,
                            maxlength/* dependency_path_maxlength */, false/* create_detailed_output */,
                            extractorConfigurationFilesArr/* extractor_configuration */,
                            SHARED_CONSTANTS.HADOOP_CAS_CONSUMER_OUTPUT_FILENAME/* output_destination */)));
            return builder.createAggregateDescription();

        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }

    }

    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job) throws ResourceInitializationException {
        return null;
    }

    @Override
    public Class<?> getInputFormatClass() {
        return Text2CASInputFormat.class;
    }

    @Override
    public void configure(JobConf job) {
        String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }
        try {
            String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
            for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
                DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job);
        } catch (IOException e) {
            e.printStackTrace();
        }
        Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class);
        job.setOutputFormat(NullOutputFormat.class); // ignore the serialized CAS and use only the output written by the CAS consumer
    }

    /** Concatenates key and value, separated by a tab, into the document text of the CAS. */
    public static class KeyPlusValueAsDocumentExtractor implements DocumentTextExtractor {
        private static Text _text = new Text(); // reused buffer; fine while each map task runs single-threaded

        @Override
        public Text extractDocumentText(Text key, Text value) {
            _text.set(key.toString() + "\t" + value.toString());
            return _text;
        }
    }
}
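
The class can also be launched programmatically. The following sketch is not part of the original source; the paths, file names, and example package are placeholders. It passes the extractor configurations through the same -D generic option that print_usage describes; ToolRunner's GenericOptionsParser copies it into the job configuration read by configure():

package de.tudarmstadt.lt.n2n.hadoop.examples;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;

public class GoogleSyntacticsJobDkbdLauncher {

    public static void main(String[] args) throws Exception {
        String[] jobArgs = {
                // equivalent to passing -D<key>=<value> on the hadoop command line
                "-D" + SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS + "=conf/extractor-1.xml,conf/extractor-2.xml",
                "input-corpus", // input path (placeholder)
                "output",       // output path (placeholder)
                "10" };         // optional number of mappers, cf. print_usage
        int exitCode = ToolRunner.run(new Configuration(), new GoogleSyntacticsJobDkbd(), jobArgs);
        System.exit(exitCode);
    }
}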
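Since buildMapperEngine only needs a Configuration, the mapper's UIMA pipeline can be smoke-tested without a cluster. A minimal sketch, assuming the required type system is on the classpath and an extractor configuration file exists at the placeholder path (outside Hadoop there is no distributed cache, so the configured path is used as-is by the null-safe lookup above):

import org.apache.hadoop.conf.Configuration;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;

public class MapperEngineSmokeTest {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, "conf/extractor-1.xml"); // placeholder
        AnalysisEngineDescription engine = new GoogleSyntacticsJobDkbd().buildMapperEngine(conf);

        // feed one document in the same key<TAB>value shape the input format produces
        JCas jcas = JCasFactory.createJCas();
        jcas.setDocumentText("doc-1\tThe quick brown fox jumps over the lazy dog .");
        SimplePipeline.runPipeline(jcas, engine);
    }
}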
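Finally, a quick illustration of KeyPlusValueAsDocumentExtractor: Text2CASInputFormat calls it for every input record, and the resulting CAS document text is simply the key and the value joined by a tab, so downstream annotators see both fields:

import org.apache.hadoop.io.Text;

import de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd.KeyPlusValueAsDocumentExtractor;

public class ExtractorDemo {

    public static void main(String[] args) {
        KeyPlusValueAsDocumentExtractor extractor = new KeyPlusValueAsDocumentExtractor();
        Text doc = extractor.extractDocumentText(new Text("doc-1"), new Text("The quick brown fox ."));
        System.out.println(doc); // prints "doc-1", a tab, then the sentence
    }
}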