de.tudarmstadt.ukp.lmf.transform.germanet.InterlingualIndexConverter.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.lmf.transform.germanet.InterlingualIndexConverter.java

Source

/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.germanet;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.enums.ESenseAxisType;
import de.tudarmstadt.ukp.lmf.model.meta.MetaData;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis;
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tuebingen.uni.sfs.germanet.api.EwnRel;
import de.tuebingen.uni.sfs.germanet.api.GermaNet;
import de.tuebingen.uni.sfs.germanet.api.IliRecord;
import de.tuebingen.uni.sfs.germanet.api.LexUnit;

/**
 * This class contains methods for converting the German part of the Interlingual Index (ILI),
 * provided in <a href="http://www.sfs.uni-tuebingen.de/lsd/index.shtml">GermaNet</a>,
 * to {@link SenseAxis} instances.
 *
 * @since UBY 0.2.0
 */
public class InterlingualIndexConverter {

    // Static so that static methods (see getDefaultMetaData()) can report problems as well
    private static final Log logger = LogFactory.getLog(InterlingualIndexConverter.class);

    // Mappings between part of speech keys (e.g. "n") and part of speech specified by Uby-LMF
    private static final Map<String, EPartOfSpeech> POS_KEY_MAPPINGS = new HashMap<String, EPartOfSpeech>();

    // Mappings between part of speech labels (e.g. "noun") and part of speech specified by Uby-LMF
    private static final Map<String, EPartOfSpeech> POS_LABEL_MAPPINGS = new HashMap<String, EPartOfSpeech>();

    static {
        // POS key <-> EPartOfSpeech
        POS_KEY_MAPPINGS.put("n", EPartOfSpeech.noun);
        POS_KEY_MAPPINGS.put("v", EPartOfSpeech.verb);
        POS_KEY_MAPPINGS.put("a", EPartOfSpeech.adjective);
        POS_KEY_MAPPINGS.put("r", EPartOfSpeech.adverb);

        // POS label <-> EPartOfSpeech
        POS_LABEL_MAPPINGS.put("noun", EPartOfSpeech.noun);
        POS_LABEL_MAPPINGS.put("verb", EPartOfSpeech.verb);
        POS_LABEL_MAPPINGS.put("adjective", EPartOfSpeech.adjective);
        POS_LABEL_MAPPINGS.put("adverb", EPartOfSpeech.adverb);
    }

    private final GNConverter gnConverter;
    private final GermaNet gnet;
    private final Lexicon wordNetLexicon;
    private final List<SenseAxis> senseAxes = new ArrayList<SenseAxis>();
    private final MetaData metaData;

    // WordNet synsets of the UBY-LMF lexicon, indexed by part of speech and then by synset offset
    private final Map<EPartOfSpeech, Map<Long, Synset>> synsetMappings = new HashMap<EPartOfSpeech, Map<Long, Synset>>();

    /**
     * Constructs an instance of {@link InterlingualIndexConverter} based on the consumed
     * parameters.
     *
     * @since 0.2.0
     *
     * @param gnConverter an instance of {@link GNConverter} associated with this generator
     *
     * @param gnet {@link GermaNet} instance used for accessing GermaNet data.
     *
     * @param wordNetLexicon {@link Lexicon} instance containing
     * <a href="https://wordnet.princeton.edu/wordnet/">WordNet 3.0</a>.
     *
     * @param alignmentMetaData {@link MetaData} instance attached to every generated
     * {@link SenseAxis}
     */
    public InterlingualIndexConverter(GNConverter gnConverter, GermaNet gnet, Lexicon wordNetLexicon,
            MetaData alignmentMetaData) {
        this.gnConverter = gnConverter;
        this.gnet = gnet;
        this.wordNetLexicon = wordNetLexicon;
        this.metaData = alignmentMetaData;
    }

    /**
     * Starts the conversion process of GermaNet's Interlingual Index to {@link SenseAxis} instances.
     * The generated sense axes can be obtained by invoking {@link #getSenseAxes()}.
     * <p>
     * Only ILI records whose relation is "synonym" are converted. For each such record a
     * synset-level axis is created and, if the WordNet word of the record can be resolved to a
     * sense, a sense-level axis as well. Records that cannot be resolved (missing or malformed
     * WordNet id, unknown part of speech, unknown synset or sense) are logged and skipped.
     */
    public void convert() {

        createSynsetMappings(wordNetLexicon);

        SynsetGenerator synsetGenerator = gnConverter.getSynsetGenerator();
        synsetGenerator.initialize();

        List<IliRecord> iliRecords = gnet.getIliRecords();
        int synsetAlignmentCounter = 0;
        int senseAlignmentCounter = 0;

        for (IliRecord iliRecord : iliRecords) {

            // Only synonyms are converted to sense axes
            EwnRel relation = iliRecord.getEwnRelation();
            if (relation == null || !"synonym".equals(relation.toString())) {
                continue;
            }

            String pwn30Id = iliRecord.getPwn30Id();
            if (pwn30Id == null) {
                logger.warn("WordNet 3.0 id is missing for " + iliRecord);
                continue; // skip
            }

            // After stripping the prefix the id is expected to look like <offset>-<pos key>
            String[] idParts = pwn30Id.replace("ENG30-", "").split("-");
            String offsetString = idParts.length > 0 ? idParts[0] : "";

            // Checked before the POS part is touched: an id without a usable offset
            // does not necessarily carry a POS suffix
            if (offsetString.contains("null")) {
                logger.warn("offsetString contains null-string for " + iliRecord);
                continue; // skip
            }

            if (idParts.length < 2) {
                logger.warn("WordNet 3.0 id '" + pwn30Id + "' has no part of speech suffix for " + iliRecord);
                continue; // skip
            }

            EPartOfSpeech pos = getUbyPosFromKey(idParts[1]);
            Map<Long, Synset> synsetsByOffset = synsetMappings.get(pos);
            if (synsetsByOffset == null) {
                logger.warn("Unsupported part of speech key '" + idParts[1] + "' for " + iliRecord);
                continue; // skip
            }

            long offset;
            try {
                offset = Long.parseLong(offsetString);
            } catch (NumberFormatException e) {
                logger.warn("Offset '" + offsetString + "' is not a number for " + iliRecord);
                continue; // skip
            }

            LexUnit lexUnit = gnet.getLexUnitByID(iliRecord.getLexUnitId());

            Synset gnUBYSynset = synsetGenerator.getLMFSynset(lexUnit);

            /*
             * Obtain the UBY-LMF synset that corresponds to the WordNet 3.0 synset
             * targeted by the ILI-record
             */
            Synset wnUBYSynset = synsetsByOffset.get(offset);

            if (wnUBYSynset == null) {
                logger.warn(
                        "Synset for the given WordNet word could not be found. SenseAxis will not be generated."
                                + iliRecord);
                continue; // skip
            }

            /*
             * Create SenseAxis for Synset
             */
            SenseAxis senseAxisSynset = createAxis(
                    "GN_WN_Synset_Alignment_Interlingual_Index_" + synsetAlignmentCounter++);
            senseAxisSynset.setSynsetOne(gnUBYSynset);
            senseAxisSynset.setSynsetTwo(wnUBYSynset);
            senseAxes.add(senseAxisSynset);

            /*
             * Create SenseAxis for Sense
             */
            String pwnWord = iliRecord.getPwnWord();

            Sense gnUBYSense = gnConverter.getSynsetGenerator().getSense(lexUnit);
            Sense wnUBYSense = getSense(wnUBYSynset, pwnWord);

            if (wnUBYSense == null) {
                logger.warn("Sense for the given WordNet word ##" + pwnWord
                        + "## could not be found. SenseAxis will not be generated." + iliRecord);
                continue; // skip
            }

            SenseAxis senseAxisSense = createAxis(
                    "GN_WN_Sense_Alignment_Interlingual_Index_" + senseAlignmentCounter++);
            senseAxisSense.setSenseOne(gnUBYSense);
            senseAxisSense.setSenseTwo(wnUBYSense);
            senseAxes.add(senseAxisSense);
        }
    }

    /**
     * Creates a {@link SenseAxis} carrying everything the synset-level and the sense-level
     * alignments have in common: axis type, id, both lexicons and the meta data.
     *
     * @param id identifier of the created axis
     *
     * @return a new axis on which the aligned synsets or senses still have to be set
     */
    private SenseAxis createAxis(String id) {
        SenseAxis axis = new SenseAxis();
        axis.setSenseAxisType(ESenseAxisType.crosslingualSenseAlignment);
        axis.setId(id);
        // Looked up on demand (not cached up front) so that the converter's lexical resource
        // is only required once an axis is actually created
        axis.setLexiconOne(gnConverter.getLexicalResource().getLexicons().get(0));
        axis.setLexiconTwo(wordNetLexicon);
        axis.setMetaData(metaData);
        return axis;
    }

    /**
     * Consumes a {@link Synset} instance that corresponds to a WordNet 3.0 synset and
     * a {@link String} representation of a word. It returns the
     * first {@link Sense} instance of the consumed UBY-LMF Synset, that belongs to a
     * LexicalEntry which has a lemma equal to the consumed word.
     *
     * @param wnUBYSynset synset that contains the senses to be queried
     *
     * @param pwnWord the returned sense must belong to a lexical entry with lemma equal to pwnWord
     *
     * @return sense that corresponds to the consumed word, or null if pwnWord is null or no
     * sense in the consumed synset belongs to a lexical entry with lemma that is equal to pwnWord
     */
    private Sense getSense(Synset wnUBYSynset, String pwnWord) {
        if (pwnWord == null) {
            return null;
        }
        for (Sense sense : wnUBYSynset.getSenses()) {
            for (FormRepresentation formRepresentation : sense.getLexicalEntry().getLemma()
                    .getFormRepresentations()) {
                // pwnWord is the receiver so that a missing written form cannot cause an exception
                if (pwnWord.equals(formRepresentation.getWrittenForm())) {
                    return sense;
                }
            }
        }
        return null;
    }

    /**
     * Initializes {@link #synsetMappings} field. The field makes an efficient search for a
     * {@link Synset} possible, for a given {@link EPartOfSpeech} and WordNet 3.0 synset offset.
     * <p>
     * The part of speech label and the offset are parsed from the first external reference of
     * every synset: the text before the first "]" must contain the label as its second
     * space-separated token, the text after it is the offset. Synsets whose reference does not
     * have this shape are logged and left out of the mappings.
     *
     * @param wordNetLexicon {@link Lexicon} used for extracting the mappings
     */
    private void createSynsetMappings(Lexicon wordNetLexicon) {

        synsetMappings.put(EPartOfSpeech.noun, new HashMap<Long, Synset>());
        synsetMappings.put(EPartOfSpeech.verb, new HashMap<Long, Synset>());
        synsetMappings.put(EPartOfSpeech.adjective, new HashMap<Long, Synset>());
        synsetMappings.put(EPartOfSpeech.adverb, new HashMap<Long, Synset>());

        for (Synset synset : wordNetLexicon.getSynsets()) {
            List<MonolingualExternalRef> externalRefs = synset.getMonolingualExternalRefs();
            if (externalRefs == null || externalRefs.isEmpty()) {
                logger.warn("Synset without external reference cannot be mapped: " + synset);
                continue; // skip
            }

            String posOffset = externalRefs.get(0).getExternalReference();
            if (posOffset == null) {
                logger.warn("External reference is missing for synset: " + synset);
                continue; // skip
            }

            String[] refParts = posOffset.split("]");
            if (refParts.length < 2) {
                logger.warn("Unexpected external reference format: " + posOffset);
                continue; // skip
            }

            String stringOffset = refParts[1].trim();
            if (stringOffset.contains("null")) {
                logger.warn("stringOffset contains null-string");
                continue; // skip
            }

            String[] labelParts = refParts[0].split(" ");
            if (labelParts.length < 2) {
                logger.warn("Unexpected external reference format: " + posOffset);
                continue; // skip
            }

            Map<Long, Synset> mapping = synsetMappings.get(getUbyPosFromLabel(labelParts[1]));
            if (mapping == null) {
                logger.warn("Unsupported part of speech label in external reference: " + posOffset);
                continue; // skip
            }

            try {
                mapping.put(Long.parseLong(stringOffset), synset);
            } catch (NumberFormatException e) {
                logger.warn("Offset is not a number in external reference: " + posOffset);
            }
        }
    }

    /**
     * This method consumes a {@link String}
     * and returns corresponding {@link EPartOfSpeech}
     * @param pos string as key, e.g. n, v, a
     * @return associated part of speech defined in UBY-LMF, or null if the key is not mapped
     * @since 0.7.0
     */
    private static EPartOfSpeech getUbyPosFromKey(String pos) {
        return POS_KEY_MAPPINGS.get(pos);
    }

    /**
     * This method consumes a {@link String}
     * and returns corresponding {@link EPartOfSpeech}
     * @param pos string as label, e.g. noun, verb
     * @return associated part of speech defined in UBY-LMF, or null if the label is not mapped
     * @since 0.7.0
     */
    private static EPartOfSpeech getUbyPosFromLabel(String pos) {
        return POS_LABEL_MAPPINGS.get(pos);
    }

    /**
     * Returns the {@link List} of all {@link SenseAxis} instances, generated by this {@link InterlingualIndexConverter}.
     * The returned list is the converter's internal list, not a copy.
     *
     * @return a list of sense axes generated by this converter, or an empty list if the converter
     * has not generated any sense axes
     */
    public List<SenseAxis> getSenseAxes() {
        return this.senseAxes;
    }

    /**
     * Creates the {@link MetaData} instance describing the GermaNet-WordNet alignment with
     * hard-coded default values.
     *
     * @return meta data with the default values set
     */
    //TODO fill in data
    public static MetaData getDefaultMetaData() {
        MetaData m = new MetaData();
        m.setAutomatic(false);
        DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
        try {
            Date creationDate = formatter.parse("2014-04-01");
            m.setCreationDate(creationDate);
        } catch (ParseException e) {
            // The literal above matches the pattern, so this is not expected to happen;
            // if it does, the creation date is left unset and the problem is reported
            logger.error("Creation date of the default meta data could not be parsed", e);
        }

        m.setCreationProcess("semi-automatic");
        m.setCreationTool("http://www.sfs.uni-tuebingen.de/GermaNet/ili.shtml");
        m.setVersion("GN 9.0");
        m.setId("GNWN_ILI_0");//TODO
        return m;
    }

}