de.tudarmstadt.ukp.uby.integration.alignment.xml.transform.sensealignments.FnWnSenseAlignmentXml.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.uby.integration.alignment.xml.transform.sensealignments.FnWnSenseAlignmentXml.java

Source

/*******************************************************************************
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package de.tudarmstadt.ukp.uby.integration.alignment.xml.transform.sensealignments;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeMap;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.integration.alignment.xml.model.Alignments;
import de.tudarmstadt.ukp.integration.alignment.xml.model.Decision;
import de.tudarmstadt.ukp.integration.alignment.xml.model.Decisiontype;
import de.tudarmstadt.ukp.integration.alignment.xml.model.ResourceXml;
import de.tudarmstadt.ukp.integration.alignment.xml.model.Source;
import de.tudarmstadt.ukp.integration.alignment.xml.model.Target;
import de.tudarmstadt.ukp.integration.alignment.xml.model.XmlMeta;
import de.tudarmstadt.ukp.lmf.api.Uby;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.transform.DBConfig;
import de.tudarmstadt.ukp.uby.integration.alignment.xml.transform.SenseAlignmentGenericXml;

/**
 * Convert the FrameNet-WordNet alignments to UBY format. This class takes the
 * FrameNet 1.5 and WordNet 3.0 ids from a file and integrates them to UBY
 *
 * @author Silvana Hartmann
 *
 */
public class FnWnSenseAlignmentXml extends SenseAlignmentXml {

    static String UBY_HOME = System.getenv("UBY_HOME");
    static String DKPRO_HOME = System.getenv("DKPRO_HOME");
    protected static Log logger = LogFactory.getLog(FnWnSenseAlignmentXml.class);
    private final Uby uby;

    ArrayList<String> notfoundWn = null;
    ArrayList<String> notfoundFn = null;
    ArrayList<String> notAddedAll;
    int inputsize = 0;

    /**
     *
     * @param alignmentFile
     * @param outFile
     * @param dbConfig
     */
    public FnWnSenseAlignmentXml(String alignmentFile, String outFile, DBConfig dbConfig) {
        super(alignmentFile, outFile);
        uby = new Uby(dbConfig);
        notfoundFn = new ArrayList<String>();
        notfoundWn = new ArrayList<String>();
        notAddedAll = new ArrayList<String>();
    }

    /**
     * Collect UBY SenseIds for the aligned senses based on synsetId and lemma
     * for WordNet and based on lexical unit id for FrameNet
     *
     * @throws IOException
     */
    @Override
    public void toAlignmentXml(XmlMeta metadata) throws IOException {
        System.err.println("to Alignment Xml");
        TreeMap<String, Source> sourceMap = new TreeMap<>();
        List<String[]> data = null;
        data = readAlignmentFile();
        int counter = 0; // input sense pairs
        int found = 0; // output sense pairs

        // iterate over alignment entries
        for (String[] d : data) {
            counter++;
            // show progress:
            if ((counter % 1000) == 0) {
                logger.info("# processed alignments: " + counter);
            }

            // use FrameNet sense externalReference (lexical unit Id)
            String fnSenseId = d[0]; // SOURCE

            Source source = null;
            if (sourceMap.containsKey(fnSenseId)) {
                source = sourceMap.get(fnSenseId);
            } else {
                source = new Source();
            }
            source.ref = fnSenseId;
            List<Target> targets = new LinkedList<Target>();
            // get WordNet sense by Synset Offset and Lemma
            List<Sense> wnSenses = uby.getSensesByWNSynsetId(d[1]);
            // List<Sense> wnSenses = uby.wordNetSenses(partOfSpeech, offset);

            for (Sense wnSense : wnSenses) {
                Target target = new Target();
                target.ref = wnSense.getId();
                Decision decision = new Decision();
                decision.confidence = SenseAlignmentGenericXml.DEFAULTCONFSCORE;
                decision.value = true;
                // decision.src = metadata.decisiontypes.get(0).name;
                target.decision = decision;
                targets.add(target);
                found++;
            }
            if (targets.size() > 0) {
                source.targets = targets;
                sourceMap.put(source.ref, source);
            }
        }

        writer.writeMetaData(metadata);
        Alignments alignments = new Alignments();
        alignments.source = new LinkedList<>();
        alignments.source.addAll(sourceMap.values());
        writer.writeAlignments(alignments);
        writer.close();
        System.err.println("Alignments in: " + counter + " OUT" + found);

        logger.info("Alignments in: " + counter + "Alignments out: " + found);
    }

    /**
     * Read alignment file in standard format, e.g.: fn_luId, wn_synset ID,
     * wn_lemma, fn_lemma
     *
     * @return
     * @throws IOException
     */
    private List<String[]> readAlignmentFile() {
        List<String[]> alignment = new ArrayList<String[]>();
        int lineNumber = 0;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(alignmentFile));
            String line = null;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                String[] items = line.split("\t");
                alignment.add(items);
            }
        } catch (FileNotFoundException e) {
            System.err.println("File not found: " + alignmentFile);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("File could not be opended: " + alignmentFile);
            IOUtils.closeQuietly(reader);
        }
        inputsize = lineNumber;
        return alignment;
    }

    /**
     * Write output lines to given file
     *
     * @param outFile
     * @param lines
     * @throws IOException
     */
    protected static void writeLines(String outFile, Collection<String> lines) throws IOException {
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(new File(outFile)));
            for (String line : lines) {
                writer.write(line + "\n");
            }
        } catch (IOException e) {
            System.err.println("Exception" + e + "could not write to" + outFile);
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    @Override
    public XmlMeta getDefaultXmlMeta() {
        XmlMeta metadata = new XmlMeta();
        metadata.title = "FrameNet - WordNet sense alignment from WordFrameNet ";
        metadata.creator = "http://adimen.si.ehu.es/web/WordFrameNet";
        metadata.date = "2010-03-23"; // from download
        metadata.description = "WordFrameNet: ";
        metadata.identifier = "FNWNwfn";
        metadata.publisher = "Laparra E. and Rigau G";
        metadata.rights = "http://creativecommons.org/licenses/by/3.0/";
        metadata.version = "TODO";
        ResourceXml targetResource = new ResourceXml();
        targetResource.description = "WordNet version 3.x"; // TODO check
        // matches lexiconId
        targetResource.id = "WN_Lexicon_0";
        targetResource.language = "en";
        // matches externalSystem
        targetResource.identifiertype = SenseAlignmentGenericXml.UBY_SENSE_ID;
        metadata.targetResource = targetResource;
        ResourceXml sourceResource = new ResourceXml();
        sourceResource.description = "FrameNet version 1.x"; // TODO check
        // matches lexiconId
        sourceResource.id = "FN_Lexicon_0";
        sourceResource.language = "en";
        // matches externalSystem
        sourceResource.identifiertype = "FrameNet_1.5_eng_lexicalUnit";
        metadata.sourceResource = sourceResource;
        Decisiontype type = new Decisiontype();
        type.id = "WFN_FNWN";
        type.name = "WFN_FNWN";
        type.type = Decisiontype.Decision.AUTOMATIC;
        List<Decisiontype> decisionTypes = new ArrayList<>();
        decisionTypes.add(type);
        metadata.decisiontypes = decisionTypes;
        // no separate scores given => no scoretype information
        return metadata;
    }

    public static void main(String[] args) throws Exception {
        String UBY_HOME = System.getenv("UBY_HOME");
        String alignmentFile = UBY_HOME + "/alignments/WordFrameNet_formatted.tsv";
        String outFile = UBY_HOME + "/target/wordFrameNet_newXml.xml";
        DBConfig dbConfig = new DBConfig("localhost/uby_clarin_0_7_0", "com.mysql.jdbc.Driver", "mysql", "root",
                "pass", false);
        FnWnSenseAlignmentXml al = new FnWnSenseAlignmentXml(alignmentFile, outFile, dbConfig);
        al.toAlignmentXml(al.getDefaultXmlMeta());
    }

}