de.tudarmstadt.ukp.lmf.transform.germanet.LexicalEntryGenerator.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.lmf.transform.germanet.LexicalEntryGenerator.java
Source

/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.germanet;

import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.enums.ERelTypeMorphology;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.morphology.Lemma;
import de.tudarmstadt.ukp.lmf.model.morphology.RelatedForm;
import de.tudarmstadt.ukp.lmf.model.semantics.PredicativeRepresentation;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticBehaviour;
import de.tuebingen.uni.sfs.germanet.api.Frame;
import de.tuebingen.uni.sfs.germanet.api.LexRel;
import de.tuebingen.uni.sfs.germanet.api.LexUnit;

/**
 * Instance of this class offers methods for creating {@link LexicalEntry}
 * out of GermaNet's data.
 */
public class LexicalEntryGenerator {

    // converter associated with this LexicalEntryGenerator
    private final GNConverter converter;

    private SenseGenerator senseGenerator;

    // running number used for creating IDs of LexicalEntries
    private int lexicalEntryNumber = 0;

    // running number used for creating IDs of SyntacitBehaviours
    private int syntacticBehaviourNumber = 0;

    // Mappings between LexicalEntries and their' corresponding LexUnit-groups
    private final HashMap<LexicalEntry, Set<LexUnit>> leLUGroupMappings = new HashMap<LexicalEntry, Set<LexUnit>>();

    // Mappings between LexUnit-groups and their' corresponding LexicalEntries
    private final Map<Set<LexUnit>, LexicalEntry> luGroupLEMappings = new HashMap<Set<LexUnit>, LexicalEntry>();

    private final Log logger = LogFactory.getLog(getClass());

    /**
     * Constructs an instance of {@link LexicalEntryGenerator}, which provides methods for creating <br>
     * LexicalEntries out of GermaNet's files
     * @param converter an instance of {@link GNConverter} associated with this generator
     * @param resourceVersion Version of the resource
     */
    public LexicalEntryGenerator(GNConverter converter, String resourceVersion) {
        this.converter = converter;
        if (senseGenerator == null) {
            senseGenerator = new SenseGenerator(converter.getGnet(), resourceVersion);
        }
    }

    /**
     * This method creates a {@link LexicalEntry} based on the
     * consumed {@link Set} of LexicalUnits
     * @param luGroup a group of LexUnits from which a LexicalEntry should be created
     * @return LexicalEntry based on consumed group of LexUnits
     */
    public LexicalEntry createLexicalEntry(Set<LexUnit> luGroup) {
        LexicalEntry lexicalEntry = new LexicalEntry();

        // Create ID
        lexicalEntry.setId(getLEID(luGroup)); // Implied

        // Create partOfSpeech
        lexicalEntry.setPartOfSpeech(getLEPOS(luGroup));

        //*** Creating Lemma ***//
        Lemma lemma = new Lemma();
        lexicalEntry.setLemma(lemma); // appending
        List<FormRepresentation> formRepresentations = getFormRepresentations(luGroup); // get all FormRepresentation for this LU
        lemma.setFormRepresentations(formRepresentations);

        //*** Creating Senses***//
        List<Sense> senses = senseGenerator.generateSenses(luGroup);

        //** Creating SyntacticBehavior (one for each LexUnit in the group)**//
        SubcategorizationFrameExtractor subcatFrameExtr = converter.getSubcategorizationFrameExtractor();
        if (lexicalEntry.getPartOfSpeech().equals(EPartOfSpeech.verb)) {
            List<SyntacticBehaviour> syntacticBehaviours = new LinkedList<SyntacticBehaviour>();

            // A SyntacticBehaviour can only be created for verbs

            LinkedList<Sense> newSenses = new LinkedList<Sense>();
            for (LexUnit lu : luGroup) {
                List<Frame> gnFrames = lu.getFrames();
                Sense sense = senseGenerator.getSynsetGenerator().getSense(lu);
                newSenses.add(sense);
                for (Frame gnFrame : gnFrames) {
                    SyntacticBehaviour syntacticBehaviour = new SyntacticBehaviour();
                    // Generating an ID
                    StringBuffer sb = new StringBuffer(32);
                    sb.append("GN_SyntacticBehaviour_").append(syntacticBehaviourNumber);
                    syntacticBehaviourNumber++;
                    syntacticBehaviour.setId(sb.toString());
                    syntacticBehaviour.setSense(sense);
                    syntacticBehaviour.setSubcategorizationFrame(
                            subcatFrameExtr.getSubcategorizationFrame(gnFrame.toString()));
                    syntacticBehaviours.add(syntacticBehaviour);

                    SemanticPredicate semanticPredicate = subcatFrameExtr.getSemanticPredicate(gnFrame.toString());
                    if (semanticPredicate != null) {

                        List<PredicativeRepresentation> predicativeRepresentations = sense
                                .getPredicativeRepresentations();
                        if (predicativeRepresentations == null) {
                            predicativeRepresentations = new LinkedList<PredicativeRepresentation>();
                        }
                        PredicativeRepresentation predicativeRepresentation = new PredicativeRepresentation();
                        predicativeRepresentation.setPredicate(semanticPredicate);
                        predicativeRepresentations.add(predicativeRepresentation);
                        sense.setPredicativeRepresentations(predicativeRepresentations);
                    }
                }
            }
            lexicalEntry.setSyntacticBehaviours(syntacticBehaviours);
            lexicalEntry.setSenses(newSenses);
        } else {
            lexicalEntry.setSenses(senses);
        }

        // record this mappings for later usage
        leLUGroupMappings.put(lexicalEntry, luGroup);
        luGroupLEMappings.put(luGroup, lexicalEntry);
        return lexicalEntry;
    }

    /**
     * Consumes an instance of a {@link LexicalEntry} and appends the associated
     * RelatedForms to it. <br>
     * This method should only be invoked after all LexicalEntries of a {@link Lexicon}
     * have been created (without RelatedForms)
     * @param lexicalEntry an instance of LexicalEntry to which RelatedForms should be appended
     * @see RelatedForm
     */
    public void setRelatedForms(LexicalEntry lexicalEntry) {
        Set<LexUnit> luGroup = leLUGroupMappings.get(lexicalEntry);
        if (luGroup.isEmpty()) {
            throw new RuntimeException("Found empty LUgroup for " + lexicalEntry.getId());
        }

        List<RelatedForm> relatedForms = new LinkedList<RelatedForm>();
        for (LexUnit lu : luGroup) {
            // Extracting derivationBaseVerb
            List<LexUnit> participles = lu.getRelatedLexUnits(LexRel.has_participle);
            if (!participles.isEmpty()) {
                for (LexUnit participle : participles) {
                    RelatedForm relatedForm = getRelatedForm(participle);
                    relatedForm.setRelType(ERelTypeMorphology.derivationBaseVerb);
                    relatedForms.add(relatedForm);
                }
            }

            // Extracting derivationBaseVerbAdj
            List<LexUnit> pertonyms = lu.getRelatedLexUnits(LexRel.has_pertainym);
            if (!pertonyms.isEmpty()) {
                for (LexUnit pertonym : pertonyms) {
                    RelatedForm relatedForm = getRelatedForm(pertonym);
                    relatedForm.setRelType(ERelTypeMorphology.derivationBaseVerbAdj);
                    relatedForms.add(relatedForm);
                }
            }
        }

        removeDuplicateRelatedForms(relatedForms);
        lexicalEntry.setRelatedForms(relatedForms);
    }

    /**
     * This method consumes an instance of {@link LexUnit} and returns
     * the corresponding instance of {@link RelatedForm}
     * @param lexUnit lexical unit for which an instance of RelatedForm class should be returned
     * @return RelatedForm which is associated with consumed lexUnit
     */
    private RelatedForm getRelatedForm(LexUnit lexUnit) {
        Set<LexUnit> targetGroup = converter.getLUGroup(lexUnit);
        if (targetGroup == null) {
            throw new RuntimeException("No LU group found for lexUnit " + lexUnit.getId());
        }

        RelatedForm relatedForm = new RelatedForm();
        relatedForm.setTargetLexicalEntry(luGroupLEMappings.get(targetGroup));
        relatedForm.setTargetSense(senseGenerator.getSense(lexUnit));
        return relatedForm;
    }

    /**
     * This method consumes a group of LexUnits and returns a {@link List} of instances of {@link FormRepresentation}, <br>
     * generated from consumed group of LexUnits
     * @param luGroup a group of LexUnits, for which the list of FormRepresentations should be generated
     * @return A list of instances of FormRepresentations from the consumed luGroup
     * @since UBY 0.1.0
     */
    private List<FormRepresentation> getFormRepresentations(Set<LexUnit> luGroup) {
        HashMap<String, String> mappings = new HashMap<String, String>(); // <orthographyName,orthographyForm>
        String orthForm = null;
        String orthVar = null;
        String oldOrthForm = null;
        String oldOrthVar = null;

        for (LexUnit lu : luGroup) {
            // Extracting orthForm
            String orthForm2 = lu.getOrthForm();
            if (orthForm == null) {
                orthForm = orthForm2;
            } else if (orthForm2 != null && !orthForm.equals(orthForm2)) {
                logger.warn("conflict, diffrent orthForm in same luGroup!");
            }

            // Extracting orthVar
            String orthVar2 = lu.getOrthVar();
            if (orthVar == null) {
                orthVar = orthVar2;
            } else if (orthVar2 != null && !orthVar.equals(orthVar2)) {
                logger.warn("conflict, diffrent orthVar in same luGroup!");
            }

            // Extracting oldOrthForm
            String oldOrthForm2 = lu.getOldOrthForm();
            if (oldOrthForm == null) {
                oldOrthForm = oldOrthForm2;
            } else if (oldOrthForm2 != null && !oldOrthForm.equals(oldOrthForm2)) {
                logger.warn("LexicalEntryGenerator: conflict, diffrent oldOrthForm in same luGroup!");
            }

            // Extracting oldOrthVar
            String oldOrthVar2 = lu.getOldOrthVar();
            if (oldOrthVar == null) {
                oldOrthVar = oldOrthVar2;
            } else if (oldOrthVar2 != null && !oldOrthVar.equals(oldOrthVar2)) {
                logger.warn("LexicalEntryGenerator: conflict, diffrent oldOrthVar in same luGroup!");
            }
        }

        // Add the mappings
        if (orthForm != null) {
            mappings.put("orthForm", orthForm);
        }
        if (orthVar != null) {
            mappings.put("orthVar", orthVar);
        }
        if (oldOrthForm != null) {
            mappings.put("oldOrthForm", oldOrthForm);
        }
        if (oldOrthVar != null) {
            mappings.put("oldOrthVar", oldOrthVar);
        }

        List<FormRepresentation> formRepresentations = new LinkedList<FormRepresentation>();
        for (String orthName : mappings.keySet()) {
            FormRepresentation formRepresentation = new FormRepresentation();
            formRepresentation.setLanguageIdentifier(ELanguageIdentifier.GERMAN);
            formRepresentation.setWrittenForm(mappings.get(orthName));
            String orthographyName = null;
            //TODO: standardize names in the model (e.g., using constants!)
            //TODO: check definition of the "variants" - IMHO, there should be only two values (new and old orthography).
            if (orthName.equals("orthForm")) {
                orthographyName = "new German orthography";
            } else if (orthName.equals("orthVar")) {
                orthographyName = "new German orthographical variant";
            } else if (orthName.equals("oldOrthForm")) {
                orthographyName = "old German orthography";
            } else if (orthName.equals("oldOrthVar")) {
                orthographyName = "old German orthographical variant";
            } else {
                throw new RuntimeException("Unknown orthography: " + orthForm);
            }
            formRepresentation.setOrthographyName(orthographyName);
            formRepresentations.add(formRepresentation);
        }
        return formRepresentations;
    }

    /**
     * Consumes a group LexicalUnits and returns {@link EPartOfSpeech} of
     * corresponding {@link LexicalEntry}
     * @param luGroup s group LexicalUnits
     * @return part of speech of the LexicalEntry
     * @see LexUnit
     */
    private EPartOfSpeech getLEPOS(Set<LexUnit> luGroup) {
        for (LexUnit lu : luGroup) {
            String wordForm = lu.getWordCategory().name();
            if (wordForm.equals("adj")) {
                return EPartOfSpeech.adjective;
            } else if (wordForm.equals("nomen")) {
                if (lu.isNamedEntity()) {
                    return EPartOfSpeech.nounProper;
                } else {
                    return EPartOfSpeech.nounCommon;
                }
            } else if (wordForm.equals("verben")) {
                return EPartOfSpeech.verb;
            }
        }

        throw new RuntimeException("Undefined part of speech for " + luGroup.iterator().next().getId());
    }

    /**
     * This method consumes a group of LexicalUnits and generates a unique
     * ID for corresponding {@link LexicalEntry}
     * @param luGroup a group of LexicalUnits for which LexicalEntry an id will be generated
     * @return generated id
     * @see LexUnit
     */
    private String getLEID(Set<LexUnit> luGroup) {
        LexicalEntry generatedLexicalEntry = luGroupLEMappings.get(luGroup);
        if (generatedLexicalEntry != null) {
            return generatedLexicalEntry.getId();
        } else {
            return "GN_LexicalEntry_" + (lexicalEntryNumber++);
        }
    }

    /**
     * This method consumes a list of RelatedForms and removes
     * all duplicates from the consumed list
     * @param relatedForms {@link List} of RelatedForm objects from which duplicates should be removed
     * @see {@link RelatedForm}
     * @since UBY 0.1.0
     */
    private void removeDuplicateRelatedForms(List<RelatedForm> relatedForms) {
        if (relatedForms.isEmpty())
            return;

        HashSet<RelatedForm> temp = new HashSet<RelatedForm>();
        temp.addAll(relatedForms);
        relatedForms.clear();
        relatedForms.addAll(temp);
    }

}