de.tudarmstadt.ukp.lmf.transform.germanet.GNConverter.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.lmf.transform.germanet.GNConverter.java
Source

/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.germanet;

import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.lmf.model.core.GlobalInformation;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.meta.MetaData;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tuebingen.uni.sfs.germanet.api.GermaNet;
import de.tuebingen.uni.sfs.germanet.api.LexUnit;
import de.tuebingen.uni.sfs.germanet.api.WordCategory;

/**
 *
 * Instance of this class converts
 * <a href="http://www.sfs.uni-tuebingen.de/lsd/index.shtml">GermaNet 7.0</a>
 * to LMF-format
 * @author Zijad Maksuti
 * @author Judith Eckle-Kohler
 *
 */
public class GNConverter {

    /** Classpath location of the subcat mapping file loaded by the constructor. */
    private static final String SUBCAT_MAPPING_RESOURCE = "GermaNetSubcatMappings/gnFrameMapping.txt";

    private final GermaNet gnet; // GermaNet Object

    private final LexicalResource lexicalResource;

    private InputStream subcatStream; // subcat mapping file

    /*
     * Groups of LexUnits with equal lemma and part of speech
     */
    private Set<Set<LexUnit>> luGroups;

    /*
     * Index from a LexUnit to the group in luGroups that contains it.
     * Built by groupLUs() so that getLUGroup(LexUnit) does not have to scan every group.
     */
    private Map<LexUnit, Set<LexUnit>> luGroupIndex;

    private SubcategorizationFrameExtractor subcategorizationFrameExtractor;

    private final SynsetGenerator synsetGenerator;

    private final String dtd_version;

    private final Log logger = LogFactory.getLog(getClass());

    private final String resourceVersion;

    private MetaData alignmentMetaData;

    /**
     * Constructs a {@link GNConverter} based on the consumed parameters.
     * <p>
     * The alignment meta data (or the default provided by
     * {@link InterlingualIndexConverter#getDefaultMetaData()} when {@code alignmentMeta} is
     * {@code null}) is attached to the consumed lexical resource and appended to its meta data list.
     * <p>
     * If the subcat mapping file cannot be loaded from the classpath, the error is logged and the
     * JVM is terminated via {@code System.exit(1)}.
     *
     * @param germaNet initialized {@link GermaNet} object
     * @param lexicalResource initialized object of  {@link LexicalResource}, which will be filled with GermaNet's data
     * @param alignmentMeta MetaData of ili Alignment, may be {@code null}
     * @param resourceVersion Version of this resource
     * @param dtd_version specifies the version of the .dtd which will be written to lexicalResource
     */
    public GNConverter(GermaNet germaNet, LexicalResource lexicalResource, MetaData alignmentMeta,
            String resourceVersion, String dtd_version) {
        this.gnet = germaNet;
        this.lexicalResource = lexicalResource;
        this.alignmentMetaData = alignmentMeta;

        if (alignmentMetaData == null) {
            alignmentMetaData = InterlingualIndexConverter.getDefaultMetaData();
        }
        alignmentMetaData.setLexicalResource(this.lexicalResource);
        List<MetaData> metaList = this.lexicalResource.getMetaData();
        metaList.add(this.alignmentMetaData);
        this.lexicalResource.setMetaData(metaList);

        this.dtd_version = dtd_version;
        this.resourceVersion = resourceVersion;
        try {
            // getResource(...) yields null for a missing resource; the resulting
            // NullPointerException is handled by the catch block below as well.
            // NOTE(review): the stream is never closed by this class - confirm whether
            // SubcategorizationFrameExtractor consumes it eagerly before closing it here.
            this.subcatStream = getClass().getClassLoader().getResource(SUBCAT_MAPPING_RESOURCE)
                    .openStream();
            subcategorizationFrameExtractor = new SubcategorizationFrameExtractor(subcatStream);
        } catch (Exception e) {
            // Pass the cause to the logger so the reason for the failure is not lost.
            logger.error("GNConverter: unable to load subcat mapping file. Aborting all operations", e);
            // NOTE(review): terminating the JVM from a constructor is kept for backward
            // compatibility; consider throwing an unchecked exception instead.
            System.exit(1);
        }

        this.synsetGenerator = new SynsetGenerator(this.gnet, resourceVersion);
    }

    /**
     * Converts the informations provided by the initialized {@link GermaNet} object to LMF-format. <br>
     * The result of the conversion can be obtained by calling {@link GNConverter#getLexicalResource()}
     */
    public void toLMF() {

        // Setting attributes of LexicalResource
        lexicalResource.setName("GermaNet");
        lexicalResource.setDtdVersion(dtd_version);

        // *** Setting GlobalInformation *** //
        GlobalInformation globalInformation = new GlobalInformation();
        globalInformation.setLabel("LMF representation of GermaNet 7.0");
        lexicalResource.setGlobalInformation(globalInformation);

        //*** Setting Lexicon (only one since GermaNet is monolingual)***//
        Lexicon lexicon = new Lexicon();
        lexicon.setLanguageIdentifier(ELanguageIdentifier.GERMAN);
        lexicon.setId("GN_Lexicon_0");
        lexicon.setName("GermaNet");
        LinkedList<Lexicon> lexicons = new LinkedList<Lexicon>();
        lexicons.add(lexicon);
        lexicalResource.setLexicons(lexicons);

        // *** Creating LexicalEntries *** //
        logger.info("Generating LexicalEntries...");
        List<LexicalEntry> lexicalEntries = createLexicalEntries();
        lexicon.setLexicalEntries(lexicalEntries);
        logger.info("Generated LexicalEntries: " + lexicalEntries.size());
        logVerbStatistics(lexicalEntries);

        // *** Appending SubcategorizationFrames *** //
        lexicon.setSubcategorizationFrames(subcategorizationFrameExtractor.getSubcategorizationFrames());

        // *** Appending SemanticPredicates *** //
        lexicon.setSemanticPredicates(subcategorizationFrameExtractor.getSemanticPredicates());

        // *** Appending Synsets *** //
        synsetGenerator.initialize();
        List<Synset> synsets = synsetGenerator.getSynsets();
        lexicon.setSynsets(synsets);
        logger.info("Generated synsets: " + synsets.size());

        // *** Appending SynSemCorrespondences *** //
        lexicon.setSynSemCorrespondences(subcategorizationFrameExtractor.getSynSemCorrespondences());

    }

    /**
     * Runs {@link #toLMF()} and afterwards converts the interlingual index alignment.
     * <p>
     * An {@link InterlingualIndexConverter} is created for this converter, the GermaNet object,
     * the consumed lexicon and the alignment meta data; the sense axes it produces are set on
     * the lexical resource of this converter.
     *
     * @param wordNetLexicon the lexicon handed to the {@link InterlingualIndexConverter}
     */
    public void toLMF(Lexicon wordNetLexicon) {
        toLMF();
        InterlingualIndexConverter iliConverter = new InterlingualIndexConverter(this, gnet, wordNetLexicon,
                alignmentMetaData);
        iliConverter.convert();
        lexicalResource.setSenseAxes(iliConverter.getSenseAxes());
    }

    /**
     * Groups the LexUnits and creates one {@link LexicalEntry} per group, then sets the
     * related forms of every created entry.
     *
     * @return the created lexical entries, in the iteration order of the LexUnit groups
     */
    private List<LexicalEntry> createLexicalEntries() {
        this.groupLUs();
        LexicalEntryGenerator leGen = new LexicalEntryGenerator(this, resourceVersion);
        List<LexicalEntry> lexicalEntries = new LinkedList<LexicalEntry>();
        // Create a LexicalEntry for each luGroup
        for (Set<LexUnit> luGroup : luGroups) {
            lexicalEntries.add(leGen.createLexicalEntry(luGroup));
        }
        // Related forms are set in a second pass, after all entries have been created
        for (LexicalEntry lexicalEntry : lexicalEntries) {
            leGen.setRelatedForms(lexicalEntry);
        }
        return lexicalEntries;
    }

    /**
     * Logs the number of verb entries and the total number of their senses.
     *
     * @param lexicalEntries the entries to count
     */
    private void logVerbStatistics(List<LexicalEntry> lexicalEntries) {
        int noVerbs = 0;
        int noVerbSenses = 0;
        for (LexicalEntry le : lexicalEntries) {
            // constant-first comparison: an entry without part of speech is simply not a verb
            if (EPartOfSpeech.verb.equals(le.getPartOfSpeech())) {
                noVerbs++;
                noVerbSenses += le.getSenses().size();
            }
        }
        logger.info("Generated verb lemmas: " + noVerbs + '\n' + "Generated verb senses: " + noVerbSenses);
    }

    /**
     * This method groups all LexUnits by lemma and {@link WordCategory}
     * and records, for every LexUnit, the group it belongs to.
     * @see LexUnit
     */
    private void groupLUs() {
        if (luGroups == null) {
            luGroups = new LinkedHashSet<Set<LexUnit>>();
        }
        if (luGroupIndex == null) {
            luGroupIndex = new HashMap<LexUnit, Set<LexUnit>>();
        }
        for (WordCategory pos : WordCategory.values()) {
            List<LexUnit> lus = gnet.getLexUnits(pos);
            // one map per word category, so equal orth forms of different categories stay apart
            Map<String, Set<LexUnit>> orthFormLUGroupMappings = new TreeMap<String, Set<LexUnit>>();
            for (LexUnit lu : lus) {
                String orthForm = lu.getOrthForm();
                Set<LexUnit> luGroup = orthFormLUGroupMappings.get(orthForm);
                if (luGroup == null) {
                    luGroup = new LinkedHashSet<LexUnit>();
                    orthFormLUGroupMappings.put(orthForm, luGroup);
                }
                luGroup.add(lu);
            }
            for (Set<LexUnit> luGroup : orthFormLUGroupMappings.values()) {
                // a group equal to an already registered one is not added again
                if (luGroups.add(luGroup)) {
                    indexGroup(luGroup);
                }
            }
        }
    }

    /**
     * Registers every member of the consumed group in the LexUnit-to-group index.
     * A LexUnit that is already indexed keeps its first group, which matches the
     * result of scanning the groups in insertion order.
     *
     * @param luGroup a group that has just been added to the set of groups
     */
    private void indexGroup(Set<LexUnit> luGroup) {
        for (LexUnit lu : luGroup) {
            if (!luGroupIndex.containsKey(lu)) {
                luGroupIndex.put(lu, luGroup);
            }
        }
    }

    /**
     * This method consumes a {@link LexUnit} and returns a {@link Set} of
     * LexUnits with equal lemma and {@link WordCategory}
     * @param lexUnit an instance of LexUnit for which
     *  a Set of LexUnits with equal lemma and WordCategory should be returned
     * @return a set of LexUnits with equal lemma and WordCategory to consumed lexUnit,
     *  or {@code null} if the LexUnit belongs to no group or the groups have not been built yet
     */
    protected Set<LexUnit> getLUGroup(LexUnit lexUnit) {
        if (luGroupIndex == null) {
            // groups are only built by toLMF(); nothing can be found before that
            return null;
        }
        return luGroupIndex.get(lexUnit);
    }

    /**
     * Returns the {@link GermaNet} object associated to this {@link GNConverter}
     * @return GermaNet object associated to this GNConverter
     */
    public GermaNet getGnet() {
        return gnet;
    }

    /**
     * Returns the {@link SubcategorizationFrameExtractor} associated to this {@link GNConverter}
     * @return the SubcategorizationFrameExtractor associated to this GNConverter
     */
    public SubcategorizationFrameExtractor getSubcategorizationFrameExtractor() {
        return subcategorizationFrameExtractor;
    }

    /**
     * Returns the {@link LexicalResource} object, which contains the results of the conversion
     * @return an instance of LexicalResource, which contains the results of the conversion
     */
    public LexicalResource getLexicalResource() {
        return lexicalResource;
    }

    /**
     * Returns the {@link SynsetGenerator} instance associated with this {@link GNConverter}.
     *
     * @return the synset generator associated with this converter
     */
    public SynsetGenerator getSynsetGenerator() {
        return synsetGenerator;
    }
}