com.digitalpebble.behemoth.gate.GATECorpusGenerator.java Source code

Java tutorial

Introduction

Here is the source code for com.digitalpebble.behemoth.gate.GATECorpusGenerator.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.behemoth.gate;

import gate.Factory;
import gate.Gate;
import gate.util.GateException;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.behemoth.BehemothConfiguration;
import com.digitalpebble.behemoth.BehemothDocument;

/**
 * Converts a Behemoth corpus into a XML corpus for GATE. This is used mostly
 * for displaying the documents with the GATE GUI. This is not a mapreduce job.
 **/
public class GATECorpusGenerator extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(GATECorpusGenerator.class);

    public GATECorpusGenerator() throws GateException {
        Gate.runInSandbox(true);
        Gate.init();
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(BehemothConfiguration.create(), new GATECorpusGenerator(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {

        Options options = new Options();
        // automatically generate the help statement
        HelpFormatter formatter = new HelpFormatter();
        // create the parser
        CommandLineParser parser = new GnuParser();

        options.addOption("h", "help", false, "print this message");
        options.addOption("i", "input", true, "Behemoth corpus");
        options.addOption("o", "output", true, "local GATE XML corpus dir");

        // parse the command line arguments
        try {
            CommandLine line = parser.parse(options, args);
            String input = line.getOptionValue("i");
            String output = line.getOptionValue("o");
            if (line.hasOption("help")) {
                formatter.printHelp("GATECorpusGenerator", options);
                return 0;
            }
            if (input == null || output == null) {
                formatter.printHelp("GATECorpusGenerator", options);
                return -1;
            }
            generateXMLdocs(input, output);

        } catch (ParseException e) {
            formatter.printHelp("GATECorpusGenerator", options);
        }
        return 0;
    }

    private void generateXMLdocs(String inputf, String outputf) throws IOException {
        Path input = new Path(inputf);

        File output = new File(outputf);
        if (output.exists() && output.isFile()) {
            System.err.println("Output " + outputf + " already exists");
            return;
        }
        if (output.exists() == false)
            output.mkdirs();

        FileSystem fs = input.getFileSystem(getConf());
        FileStatus[] statuses = fs.listStatus(input);
        int count[] = { 0 };
        for (int i = 0; i < statuses.length; i++) {
            FileStatus status = statuses[i];
            Path suPath = status.getPath();
            if (suPath.getName().equals("_SUCCESS"))
                continue;
            generateXMLdocs(suPath, output, count);
        }
    }

    private void generateXMLdocs(Path input, File dir, int[] count) throws IOException {

        Configuration conf = BehemothConfiguration.create();
        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            BufferedWriter writer = null;
            gate.Document gatedocument = null;
            while (current.next(key, inputDoc)) {
                count[0]++;
                // generate a GATE document then save it to XML
                try {
                    // first put the text
                    GATEProcessor gp = new GATEProcessor(new URL("http://dummy.com"));
                    gp.setConfig(conf);
                    gatedocument = gp.generateGATEDoc(inputDoc);

                    // then save as XML
                    File outputFile = new File(dir, count[0] + ".xml");
                    if (outputFile.exists() == false)
                        outputFile.createNewFile();

                    writer = new BufferedWriter(new FileWriter(outputFile));
                    writer.write(gatedocument.toXml());

                } catch (Exception e) {
                    LOG.error("Exception on doc [" + count[0] + "] " + key.toString(), e);
                } finally {
                    if (writer != null)
                        writer.close();
                    if (gatedocument != null)
                        Factory.deleteResource(gatedocument);
                }
            }
            current.close();
        }
    }

}