// Java tutorial
/* * Copyright 2013 Rodrigo Agerri Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package es.ehu.si.ixa.pipe.parse; import ixa.kaflib.KAFDocument; import ixa.kaflib.WF; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import es.ehu.si.ixa.pipe.parse.heads.HeadFinder; import opennlp.tools.parser.Parse; /** * * Class to provide parsing annotation in various forms: KAF, Penn style, and * with headWords marked. It also loads the right model for each language. * * @author ragerri * @version 2014-02-03 * */ public class Annotate { private boolean MARKHEADS; private ConstituentParsing parser; private HeadFinder headFinder; /** * Constructor that takes into account lang options (en|es) loads the * corresponding parse model and decides whether to mark headWords or not. * * @param lang */ public Annotate(String lang, String model) { parser = new ConstituentParsing(lang, model); MARKHEADS = false; } /** * Constructor that takes lang options (en|es) and a headFinder as parameters * and loads the corresponding parse model; it also states whether headWords * should be marked. 
* * @param lang * @param headFinder */ public Annotate(String lang, String model, HeadFinder headFinder) { parser = new ConstituentParsing(lang, model); this.headFinder = headFinder; MARKHEADS = true; } /** * It takes an array of tokens and outputs a string with tokens joined by a * whitespace. * * @param array * of tokens * @return string representing one sentence for each array */ private String getSentenceFromTokens(String[] tokens) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { sb.append(tokens[i]).append(" "); } String sentence = sb.toString(); return sentence; } /** * @param kaf * document containing <text> and <terms> elements * @return StringBuffer containing the Parse tree * @throws IOException */ private StringBuffer getParse(KAFDocument kaf) throws IOException { StringBuffer parsingDoc = new StringBuffer(); List<List<WF>> sentences = kaf.getSentences(); for (List<WF> sentence : sentences) { // get array of token forms from a list of WF objects String[] tokens = new String[sentence.size()]; for (int i = 0; i < sentence.size(); i++) { tokens[i] = sentence.get(i).getForm(); } // Constituent Parsing String sent = getSentenceFromTokens(tokens); Parse[] parsedSentence = parser.parse(sent, 1); if (MARKHEADS) { for (Parse parse : parsedSentence) { headFinder.printHeads(parse); } } for (Parse parsedSent : parsedSentence) { parsedSent.show(parsingDoc); parsingDoc.append("\n"); } } return parsingDoc; } /** * It takes a KAF document calls to getParse() and outputs the parse tree as * KAF constituents elements * * @param KAF * document containing <text> and <terms> elements * @return KAF <constituents> elements * @throws IOException */ public void parseToKAF(KAFDocument kaf) throws IOException { StringBuffer parsingDoc = getParse(kaf); try { kaf.addConstituencyFromParentheses(parsingDoc.toString()); } catch (Exception e) { e.printStackTrace(); } } /** * @param KAF * document containing <text> and <terms> elements * @return parse 
tree into plain text * @throws IOException */ public String parse(KAFDocument kaf) throws IOException { StringBuffer parsingDoc = getParse(kaf); return parsingDoc.toString(); } /** * Takes as input a list of parse strings, one for line, and annotates the * headwords * * @param inputTrees * @return a list of parse trees with headwords annotated */ private String addHeadWordsToTreebank(List<String> inputTrees) { StringBuffer parsedDoc = new StringBuffer(); for (String parseSent : inputTrees) { Parse parsedSentence = Parse.parseParse(parseSent); headFinder.printHeads(parsedSentence); parsedSentence.show(parsedDoc); parsedDoc.append("\n"); } return parsedDoc.toString(); } public void parseForTesting(File inputText) throws IOException { StringBuffer parsingDoc = new StringBuffer(); if (inputText.isFile()) { List<String> inputTrees = FileUtils.readLines(inputText, "UTF-8"); for (String sentence : inputTrees) { Parse parsedSentence = parser.parse(sentence, 1)[0]; parsedSentence.show(parsingDoc); parsingDoc.append("\n"); } File outfile = new File(FilenameUtils.removeExtension(inputText.getPath()) + ".test"); System.err.println("Writing test parse file to " + outfile); FileUtils.writeStringToFile(outfile, parsingDoc.toString(), "UTF-8"); } else { System.out.println("Choose a correct file!"); System.exit(1); } } /** * Takes a file containing Penn Treebank oneline annotation and annotates the * headwords, saving it to a file with the *.th extension. 
Optionally also * processes recursively an input directory adding heads only to the files * with the files with the specified extension * * @param dir * the input file or directory * @param ext * the extension to look for in the directory * @throws IOException */ public void processTreebankWithHeadWords(File dir, String ext) throws IOException { // process one file if (dir.isFile()) { List<String> inputTrees = FileUtils.readLines(new File(dir.getCanonicalPath()), "UTF-8"); File outfile = new File(FilenameUtils.removeExtension(dir.getPath()) + ".th"); String outTree = addHeadWordsToTreebank(inputTrees); FileUtils.writeStringToFile(outfile, outTree, "UTF-8"); System.err.println(">> Wrote headWords to Penn Treebank to " + outfile); } else { // recursively process directories File listFile[] = dir.listFiles(); if (listFile != null) { if (ext == null) { System.out.println( "For recursive directory processing of treebank files specify the extension of the files containing the syntactic trees."); System.exit(1); } for (int i = 0; i < listFile.length; i++) { if (listFile[i].isDirectory()) { processTreebankWithHeadWords(listFile[i], ext); } else { try { List<String> inputTrees = FileUtils.readLines( new File(FilenameUtils.removeExtension(listFile[i].getCanonicalPath()) + ext), "UTF-8"); File outfile = new File(FilenameUtils.removeExtension(listFile[i].getPath()) + ".th"); String outTree = addHeadWordsToTreebank(inputTrees); FileUtils.writeStringToFile(outfile, outTree, "UTF-8"); System.err.println(">> Wrote headWords to " + outfile); } catch (FileNotFoundException noFile) { continue; } } } } } } }