de.tudarmstadt.ukp.dkpro.core.RSTAnnotator.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.RSTAnnotator.java

Source

/*
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.dkpro.core;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import java.io.File;
import java.io.IOException;

/**
 * Wrapper for RST parser by Feng et al., 2014 (ACL) http://www.cs.toronto.edu/~weifeng/software.html
 * <p>
 * The parser is not called in-process: both the optional sanity check and every parse run spawn
 * an external {@code python} process whose working directory is the parser's {@code src/}
 * directory ({@link #PARAM_RST_PARSER_SRC_DIR_PATH}).
 * <p>
 * Note: Using {@code StanfordSegmenter} is preferred
 *
 * @author Ivan Habernal
 */
public class RSTAnnotator extends JCasAnnotator_ImplBase {
    /**
     * Path to downloaded RST parser (src/ dir)
     */
    public static final String PARAM_RST_PARSER_SRC_DIR_PATH = "rstParserSrcDirPath";

    @ConfigurationParameter(name = PARAM_RST_PARSER_SRC_DIR_PATH, mandatory = true)
    String rstParserSrcDirPath;

    /**
     * If true (default), calls "sanity_check.py" from RST parser during initialization
     */
    public static final String PARAM_SANITY_CHECK_ON_INIT = "sanityCheckOnInit";
    @ConfigurationParameter(name = PARAM_SANITY_CHECK_ON_INIT, mandatory = true, defaultValue = "true")
    boolean sanityCheckOnInit;

    /**
     * For debug purposes; if true, the temporary input, .tree and .log files created for each
     * parser run are not deleted afterwards (they live in the default temporary-file directory).
     * Default is false.
     */
    public static final String PARAM_KEEP_TMP_FILES = "keepTmpFiles";
    @ConfigurationParameter(name = PARAM_KEEP_TMP_FILES, mandatory = true, defaultValue = "false")
    boolean keepTmpFiles;

    /**
     * For debugging purposes; logs output of the RST parser
     */
    public static final String PARAM_DEBUG_RST_OUTPUT = "debugRSTOutput";
    @ConfigurationParameter(name = PARAM_DEBUG_RST_OUTPUT, mandatory = true, defaultValue = "false")
    boolean debugRSTOutput;

    /**
     * Initializes the component and, if {@link #PARAM_SANITY_CHECK_ON_INIT} is enabled, runs
     * {@code sanity_check.py} from the RST parser source directory and waits for it to finish.
     *
     * @param context UIMA context
     * @throws ResourceInitializationException if the process cannot be started or the thread is
     *                                         interrupted while waiting for it
     * @throws RuntimeException                if the sanity check exits with a non-zero code
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        // perform sanity check
        if (sanityCheckOnInit) {
            File rstParserSrcDir = new File(rstParserSrcDirPath);

            // create process; its stdin/stdout/stderr are those of this JVM
            ProcessBuilder processBuilder = new ProcessBuilder().inheritIO();

            // working dir must be set to the src dir of RST parser
            processBuilder.directory(rstParserSrcDir);

            // run the command
            processBuilder.command("python", new File(rstParserSrcDir, "sanity_check.py").getAbsolutePath());

            try {
                int returnValue = runAndWait(processBuilder);

                if (returnValue != 0) {
                    throw new RuntimeException("Process exited with code " + returnValue);
                }
            } catch (IOException | InterruptedException e) {
                // on interruption, runAndWait has already restored the interrupt flag
                throw new ResourceInitializationException(e);
            }
        }
    }

    /**
     * Parses the document text with the RST parser and, if a parse was produced, hands it to
     * {@code RSTParseOutputReader} to annotate the CAS.
     *
     * @param aJCas the CAS to process
     * @throws AnalysisEngineProcessException if running the parser fails with an I/O error
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        try {
            // parse
            String parse = parseWithRST(aJCas.getDocumentText());

            // null means the parser finished but left no .tree file; nothing to annotate then
            if (parse != null) {
                // annotate
                RSTParseOutputReader reader = new RSTParseOutputReader();
                reader.readParseOutput(parse, aJCas);
            }
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Runs the parser on the given text.
     * <p>
     * The text is written to a temporary file, {@code parse.py} is run on it as an external
     * process (output and errors redirected to a temporary log file), and the resulting
     * {@code .tree} file is read back. Unless {@link #PARAM_KEEP_TMP_FILES} is set, all
     * temporary files are removed afterwards, whether or not the run succeeded.
     *
     * @param originalText text
     * @return parse tree, or {@code null} if the parser exited normally but no {@code .tree}
     * file was found
     * @throws IOException      if a temporary file cannot be written or read, the process cannot
     *                          be started, or the thread is interrupted while waiting for it
     * @throws RuntimeException if the parser exits with a non-zero code
     */
    public String parseWithRST(String originalText) throws IOException {
        // temporary file in
        File tmpFileIn = File.createTempFile("rst_tmp", ".txt");
        // output of RST parser is a .tree file
        File tmpFileOut = new File(tmpFileIn.getAbsolutePath() + ".tree");
        // tmp log
        File tmpFileLog = new File(tmpFileIn.getAbsolutePath() + ".log");

        try {
            // write the text into a temporary file
            // NOTE(review): this and the reads below use the platform default charset;
            // confirm which encoding the parser expects before pinning one explicitly
            FileUtils.writeStringToFile(tmpFileIn, originalText);

            String tmpDirName = System.getProperty("java.io.tmpdir");

            File rstParserSrcDir = new File(rstParserSrcDirPath);

            // create process
            ProcessBuilder processBuilder = new ProcessBuilder().inheritIO();

            // log to file (stderr is merged into stdout, which overrides the inherited stdout)
            processBuilder.redirectErrorStream(true);
            processBuilder.redirectOutput(ProcessBuilder.Redirect.to(tmpFileLog));

            // working dir must be set to the src dir of RST parser
            processBuilder.directory(rstParserSrcDir);

            // run the command and wait
            processBuilder.command("python", new File(rstParserSrcDir, "parse.py").getAbsolutePath(), "-t",
                    tmpDirName, tmpFileIn.getAbsolutePath(), "-g");
            int returnValue = runAndWait(processBuilder);

            if (returnValue != 0) {
                // the log file is removed in the finally block unless keepTmpFiles is set
                throw new RuntimeException("Process exited with code " + returnValue);
            }

            // read the log
            if (this.debugRSTOutput) {
                getLogger().debug(FileUtils.readFileToString(tmpFileLog));
            }

            // read the output
            if (tmpFileOut.exists()) {
                return FileUtils.readFileToString(tmpFileOut);
            }
        } catch (InterruptedException e) {
            // runAndWait has already stopped the child process and restored the interrupt flag
            throw new IOException(e);
        } finally {
            // clean up
            if (!keepTmpFiles) {
                FileUtils.deleteQuietly(tmpFileIn);
                FileUtils.deleteQuietly(tmpFileOut);
                FileUtils.deleteQuietly(tmpFileLog);
            }
        }

        return null;
    }

    /**
     * Starts the configured process and blocks until it terminates.
     * <p>
     * If the waiting thread is interrupted, the child process is destroyed (so it does not keep
     * running unattended) and the thread's interrupt flag is restored before the exception is
     * rethrown, so that callers further up can still observe the interruption.
     *
     * @param processBuilder fully configured process builder
     * @return exit code of the process
     * @throws IOException          if the process cannot be started
     * @throws InterruptedException if the current thread is interrupted while waiting
     */
    private static int runAndWait(ProcessBuilder processBuilder)
            throws IOException, InterruptedException {
        Process process = processBuilder.start();

        try {
            return process.waitFor();
        } catch (InterruptedException e) {
            process.destroy();
            Thread.currentThread().interrupt();
            throw e;
        }
    }
}