LVCoref.MMAX2.java Source code

Java tutorial

Introduction

Here is the source code for LVCoref.MMAX2.java

Source

/*******************************************************************************
 * Copyright 2013,2014 Institute of Mathematics and Computer Science, University of Latvia
 * Author: Artūrs Znotiņš
 * 
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package LVCoref;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.lang3.StringEscapeUtils;
import org.w3c.dom.NodeList;

/**
 * Conversion helpers between LVCoref {@link Document}s and MMAX2 annotation-tool files.
 *
 * <p>The {@code export*} methods write the word, coreference and sentence levels of a document
 * as UTF-8 XML, {@link #createProject} bundles them together with a {@code .mmax} project file,
 * and {@link #addMmaxNeAnnotation} / {@link #exportNeAnnotation} copy named-entity categories
 * from an MMAX2 markable file onto the document and dump them as a tab-separated file.
 *
 * <p>Word ids in the XML are written as {@code id + 1} ({@code word_1}, {@code word_2}, ...)
 * and one is subtracted again when a span is read back in.
 *
 * @author Art&#363;rs
 */
public class MMAX2 {
    private static final Logger LOGGER = Logger.getLogger(MMAX2.class.getName());

    /** Character encoding used for every file written by this class. */
    private static final String ENCODING = "UTF-8";

    private static final String XML_DECLARATION = "<?xml version=\"1.0\" encoding=\"" + ENCODING + "\"?>\n";

    /**
     * Writes all mentions of the document as an MMAX2 {@code coref} markable level.
     *
     * <p>A mention whose cluster holds more than one mention gets
     * {@code coref_class="set_<clusterId>"}; any other mention gets {@code "empty"}.
     * If the file cannot be opened the failure is logged and nothing is written.
     *
     * @param d document whose {@code mentions} and {@code corefClusters} are exported
     * @param filename path of the XML file to create (overwritten if present)
     */
    public static void exportMentions(Document d, String filename) {
        BufferedWriter writer = openWriter(filename);
        if (writer == null) {
            return;
        }
        try {
            Utils.toWriter(writer, markablesHeader("coref"));
            for (Mention m : d.mentions) {
                String span = wordSpan(m.start + 1, m.end + 1);
                boolean inRealCluster = d.corefClusters.get(m.corefClusterID).corefMentions.size() > 1;
                String corefClass = inRealCluster ? "set_" + m.corefClusterID : "empty";
                String category = mmaxCategory(m.category);

                Utils.toWriter(writer,
                        "<markable id=\"markable_" + (m.id + 1) + "\" span=\"" + span + "\" coref_class=\""
                                + corefClass + "\" category=\"" + category
                                + "\" mmax_level=\"coref\"  rule=\"none\"  type=\"none\"/>\n");
            }
            Utils.toWriter(writer, "</markables>");
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Writes every node of the document tree as an MMAX2 {@code <word>} element.
     *
     * <p>The word text is XML-escaped. If the file cannot be opened the failure is logged and
     * nothing is written.
     *
     * @param d document whose {@code tree} nodes are exported
     * @param filename path of the XML file to create (overwritten if present)
     */
    public static void exportWords(Document d, String filename) {
        BufferedWriter writer = openWriter(filename);
        if (writer == null) {
            return;
        }
        try {
            Utils.toWriter(writer, XML_DECLARATION + "<!DOCTYPE words SYSTEM \"words.dtd\">\n" + "<words>\n");
            for (Node n : d.tree) {
                Utils.toWriter(writer,
                        "<word id=\"word_" + (n.id + 1) + "\">" + StringEscapeUtils.escapeXml(n.word)
                                + "</word>\n");
            }
            Utils.toWriter(writer, "</words>");
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Writes the sentence boundaries of the document as an MMAX2 {@code sentence} markable level.
     *
     * <p>A markable is emitted for every node flagged {@code sentEnd}; its span starts at the
     * most recent node flagged {@code sentStart}. Markable ids are numbered from 1 in document
     * order. If the file cannot be opened the failure is logged and nothing is written.
     *
     * @param d document whose {@code tree} nodes are scanned
     * @param filename path of the XML file to create (overwritten if present)
     */
    public static void exportSentences(Document d, String filename) {
        BufferedWriter writer = openWriter(filename);
        if (writer == null) {
            return;
        }
        try {
            Utils.toWriter(writer, markablesHeader("sentence"));

            // NOTE(review): stays -1 until the first sentStart node is seen, so a sentEnd that
            // precedes any sentStart produces a span beginning at "word_-1".
            int start = -1;
            int sentenceId = 0;
            for (Node n : d.tree) {
                if (n.sentStart) {
                    start = n.id + 1;
                }
                if (n.sentEnd) {
                    int end = n.id + 1;
                    sentenceId++;
                    Utils.toWriter(writer, "<markable mmax_level=\"sentence\" id=\"markable_" + sentenceId
                            + "\" span=\"" + wordSpan(start, end) + "\" />\n");
                }
            }
            Utils.toWriter(writer, "</markables>");
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Creates the files of an MMAX2 project for the document: {@code <project>_words.xml},
     * {@code <project>_coref_level.xml}, {@code <project>_sentence_level.xml} and the
     * {@code <project>.mmax} project file that points at the words file.
     *
     * @param d document to export
     * @param project base name shared by all generated files
     * @param path directory prefix, with trailing "/"; it is concatenated with the file names as-is
     */
    public static void createProject(Document d, String project, String path) {
        String wordsFile = project + "_words.xml";

        exportWords(d, path + wordsFile);
        exportMentions(d, path + project + "_coref_level.xml");
        exportSentences(d, path + project + "_sentence_level.xml");

        //-----Create project file
        BufferedWriter writer = openWriter(path + project + ".mmax");
        if (writer == null) {
            return;
        }
        try {
            // The file name is embedded in XML text, so escape it like the word contents are.
            Utils.toWriter(writer,
                    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<mmax_project>\n" + "<words>"
                            + StringEscapeUtils.escapeXml(wordsFile) + "</words>\n"
                            + "<keyactions></keyactions>\n" + "<gestures></gestures>\n"
                            + "</mmax_project>\n");
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Copies the {@code category} of each markable in an MMAX2 markable file onto the
     * {@code ne_annotation} field of the document nodes covered by the markable's span.
     *
     * <p>Markables of category {@code "other"} are skipped, as are markables whose first or last
     * node already carries a non-empty annotation. Only the first comma-separated interval of a
     * discontinuous span is used.
     *
     * <p>Any failure (unreadable or malformed file, missing attribute, bad span) is reported on
     * {@code System.err}; annotations applied before the failure are kept.
     *
     * @param d document whose nodes are annotated in place
     * @param annotation_filename path of the MMAX2 markable XML file to read
     * @return {@code true} if the whole file was processed, {@code false} if an error occurred
     */
    public static Boolean addMmaxNeAnnotation(Document d, String annotation_filename) {
        try {
            DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            org.w3c.dom.Document doc = dBuilder.parse(new File(annotation_filename));
            NodeList markables = doc.getElementsByTagName("markable");

            for (int i = 0; i < markables.getLength(); i++) {
                org.w3c.dom.Node markable = markables.item(i);

                String span = markable.getAttributes().getNamedItem("span").getNodeValue();
                String category = markable.getAttributes().getNamedItem("category").getNodeValue();

                String[] interval = span.split(",")[0].split("\\.\\.");
                int start = parseWordIndex(interval[0]);
                int end = interval.length > 1 ? parseWordIndex(interval[1]) : start;

                if (category.equals("other")) {
                    continue;
                }
                // Do not overwrite an existing annotation at either edge of the span.
                // (The original tested the start node twice; the second test is meant for the end.)
                if (d.getNode(start).ne_annotation.length() != 0 || d.getNode(end).ne_annotation.length() != 0) {
                    continue;
                }

                for (int j = start; j <= end; j++) {
                    d.getNode(j).ne_annotation = category;
                }
            }
        } catch (Exception e) {
            System.err.println("Error adding MMAX2 annotation:" + e.getMessage());
            return false;
        }
        return true;
    }

    /**
     * Writes one tab-separated line per node: {@code conll_fields} 1, the first character of
     * field 3, fields 2 and 4, and the node's {@code ne_annotation}. A blank line follows every
     * node flagged {@code sentEnd}.
     *
     * <p>Side effect: an empty {@code ne_annotation} is replaced by {@code "O"} on the node
     * itself. The file is written as UTF-8, matching the other exports of this class.
     *
     * @param d document whose {@code tree} nodes are exported
     * @param export_filename path of the file to create (overwritten if present)
     */
    public static void exportNeAnnotation(Document d, String export_filename) {
        PrintWriter out = null;
        try {
            String eol = System.getProperty("line.separator");
            // Explicit charset: FileWriter would silently use the platform default encoding.
            out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(export_filename), ENCODING));
            StringBuilder s = new StringBuilder();
            for (Node n : d.tree) {
                s.append(n.conll_fields.get(1));
                s.append("\t");
                s.append(n.conll_fields.get(3).charAt(0));
                s.append("\t");
                s.append(n.conll_fields.get(2));
                s.append("\t");
                s.append(n.conll_fields.get(4));
                s.append("\t");
                if (n.ne_annotation.length() == 0) {
                    n.ne_annotation = "O";
                }
                s.append(n.ne_annotation);
                s.append(eol);
                if (n.sentEnd) {
                    s.append(eol);
                }
            }
            out.print(s.toString());
            out.flush();
            // PrintWriter never throws IOException; write failures are only visible here.
            if (out.checkError()) {
                LOGGER.log(Level.SEVERE, "Error while writing " + export_filename);
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Cannot open " + export_filename + " for writing", ex);
            System.err.println("ERROR: couldn't create/open output conll file");
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /**
     * Reads a CoNLL file, adds named-entity annotation from an MMAX2 coref level file and writes
     * the tab-separated export.
     *
     * <p>Optional arguments override the built-in defaults (the {@code interview_16} data set):
     * {@code args[0]} MMAX2 markable file, {@code args[1]} CoNLL input, {@code args[2]} export
     * file. Earlier revisions switched between interviews 16, 23, 27, 38, 43 and 46 by editing
     * these paths, all following the same naming pattern.
     *
     * @param args optional paths as described above
     * @throws Exception if reading the CoNLL file fails
     */
    public static void main(String[] args) throws Exception {
        String mmax = argOrDefault(args, 0, "data/interview_16_coref_level.xml");
        String conll = argOrDefault(args, 1, "data/pipeline/interview_16.lvsem.conll");
        String ne_export = argOrDefault(args, 2, "data/pipeline/interview_16.ne_export.tab");

        Document d = new Document();
        d.readCONLL(conll);

        addMmaxNeAnnotation(d, mmax);
        exportNeAnnotation(d, ne_export);
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

    /** Opens a buffered UTF-8 writer; logs and returns {@code null} if the file cannot be opened. */
    private static BufferedWriter openWriter(String filename) {
        try {
            return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), ENCODING));
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Cannot open " + filename + " for writing", ex);
            return null;
        }
    }

    /** Flushes and closes the writer, logging instead of propagating any I/O failure. */
    private static void closeWriter(BufferedWriter writer) {
        try {
            writer.close(); // close() flushes the buffered output first
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Error while closing output file", ex);
        }
    }

    /** Builds the XML prolog and opening {@code <markables>} tag for the given MMAX2 level name. */
    private static String markablesHeader(String level) {
        return XML_DECLARATION
                + "<!DOCTYPE markables SYSTEM \"markables.dtd\">\n"
                + "<markables xmlns=\"www.eml.org/NameSpaces/" + level + "\">\n";
    }

    /** Formats an MMAX2 span: {@code word_a} for a single word, {@code word_a..word_b} otherwise. */
    private static String wordSpan(int first, int last) {
        if (first == last) {
            return "word_" + first;
        }
        return "word_" + first + ".." + "word_" + last;
    }

    /** Maps a mention category to its MMAX2 attribute value; unknown or {@code null} gives "other". */
    private static String mmaxCategory(String category) {
        if (category != null) {
            if (category.equals("ORG")) {
                return "organization";
            } else if (category.equals("LOCATION")) {
                return "location";
            } else if (category.equals("PERSON")) {
                return "person";
            }
        }
        return "other";
    }

    /** Converts a {@code word_N} id from a span into N - 1 by dropping the 5-character prefix. */
    private static int parseWordIndex(String wordId) {
        return Integer.parseInt(wordId.substring(5)) - 1;
    }
}