de.tudarmstadt.ukp.lmf.transform.wordnet.SynsetGenerator.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.lmf.transform.wordnet.SynsetGenerator.java
Source

/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.wordnet;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Word;
import net.sf.extjwnl.dictionary.Dictionary;
import net.sf.extjwnl.dictionary.MorphologicalProcessor;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.lmf.model.core.Definition;
import de.tudarmstadt.ukp.lmf.model.core.Statement;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EStatementType;
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.transform.wordnet.util.WNConvUtil;

/**
 * Instances of this class offer methods for creating {@link Synset} instances
 * from WordNet's data.
 */
public class SynsetGenerator {

    /**
     * Candidate assignment of an example sentence to one sense of a synset.
     * Holds the sense key, the (cleaned) lemma used for matching and a score
     * that the matching heuristics raise when the lemma is found in the example.
     */
    protected static class ExampleMapping {

        /** Sense key of the candidate sense. */
        protected String senseKey;
        /** Lemma that is searched for in the example sentence. */
        protected String lemma;
        /** Matching score; starts at zero. */
        protected int score;

        /**
         * Creates a mapping candidate with a score of zero.
         * @param senseKey sense key of the candidate sense
         * @param lemma lemma of the candidate sense
         */
        public ExampleMapping(final String senseKey, final String lemma) {
            this.lemma = lemma;
            this.senseKey = senseKey;
            this.score = 0;
        }

        /** @return the sense key of the candidate sense */
        public String getSenseKey() {
            return this.senseKey;
        }

        /** @return the lemma of the candidate sense */
        public String getLemma() {
            return this.lemma;
        }

        /** @return the current matching score */
        public int getScore() {
            return this.score;
        }

        /**
         * Overwrites the matching score.
         * @param score the new score
         */
        public void setScore(int score) {
            this.score = score;
        }

        /**
         * Raises the matching score.
         * @param increment value added to the current score
         */
        public void addScore(final int increment) {
            this.score = this.score + increment;
        }

    }

    /** Suffix that is appended to the resource version to name the external system of the synset-offset references. */
    public final static String EXTERNAL_SYSTEM_SYNSET_OFFSET = "synsetOffset";

    private final Log logger = LogFactory.getLog(getClass());

    private final Dictionary wordnet; // WordNet Dictionary
    private MorphologicalProcessor morphProcessor; // obtained from the dictionary on first use in makeBaseFormList
    private final String resourceVersion;
    private boolean initialized = false; // set to true at the end of initialize()

    private final List<Synset> synsets = new ArrayList<Synset>();
    private int lmfSynsetNumber = 0; // running number used for creating IDs of Synsets

    // Mappings between WordNet's synsets and Uby-LMF synsets
    private final Map<net.sf.extjwnl.data.Synset, Synset> wnSynsetLMFSynsetMappings = new HashMap<net.sf.extjwnl.data.Synset, Synset>();

    // Mappings between lexemes and associated example sentences (extracted from WordNet's glosses);
    // keys are sense keys
    private final Map<String, List<String>> examples = new TreeMap<String, List<String>>();

    // Lines describing examples that could not be assigned to a sense automatically;
    // created lazily by processExample, so it stays null while no such example occurred
    protected List<String> annotationList;
    // Statistics of processExample: each index counts how often one of the
    // matching steps succeeded (even indices) or was passed without a match (odd indices)
    protected int[] annotationCounter = new int[10];

    /**
     * Constructs a {@link SynsetGenerator} based on the consumed parameters.
     * The constructor only stores its arguments; the synsets are created by {@link #initialize()}.
     * @param wordnet initialized {@link Dictionary}-instance, used for accessing information encoded in WordNet's files
     * @param resourceVersion version of the resource; used as prefix of the external system name
     *        of the {@link MonolingualExternalRef} attached to every generated synset
     */
    public SynsetGenerator(final Dictionary wordnet, final String resourceVersion) {
        this.wordnet = wordnet;
        this.resourceVersion = resourceVersion;
    }

    /**
     * Transforms all WordNet synsets to UBY synsets and stores the result in
     * member variables. For every synset an ID, the definition (with the
     * unassigned examples as usage notes) and a reference to the WordNet
     * synset offset are created. Subsequent calls return immediately.
     *
     * @throws JWNLException if WordNet's data cannot be accessed
     */
    public void initialize() throws JWNLException {
        if (initialized) {
            return;
        }

        for (POS pos : POS.getAllPOS()) {
            logger.info("processing " + pos.getLabel());

            for (Iterator<net.sf.extjwnl.data.Synset> wnSynsets = wordnet.getSynsetIterator(pos); wnSynsets.hasNext();) {
                net.sf.extjwnl.data.Synset wnSynset = wnSynsets.next();

                // Create the synset with a running ID and register it.
                Synset lmfSynset = new Synset();
                lmfSynset.setId("WN_Synset_" + lmfSynsetNumber++);
                synsets.add(lmfSynset);
                wnSynsetLMFSynsetMappings.put(wnSynset, lmfSynset);

                // Split the gloss; examples that could not be assigned to a
                // sense are collected as usage notes.
                List<String> usageNotes = new ArrayList<String>();
                String definitionText = processGloss(wnSynset, lmfSynset, usageNotes);
                boolean hasDefinition = definitionText != null && !definitionText.isEmpty();
                if (hasDefinition) {
                    Definition definition = new Definition();
                    definition.setTextRepresentations(
                            WNConvUtil.makeTextRepresentationList(definitionText, ELanguageIdentifier.ENGLISH));
                    List<Definition> definitions = new LinkedList<Definition>();
                    definitions.add(definition);
                    lmfSynset.setDefinitions(definitions);

                    // Usage notes are only attached if there is a definition to hold them.
                    if (!usageNotes.isEmpty()) {
                        List<Statement> statements = new ArrayList<Statement>();
                        for (String usageNote : usageNotes) {
                            Statement statement = new Statement();
                            statement.setStatementType(EStatementType.usageNote);
                            statement.setTextRepresentations(
                                    WNConvUtil.makeTextRepresentationList(usageNote, ELanguageIdentifier.ENGLISH));
                            statements.add(statement);
                        }
                        definition.setStatements(statements);
                    }
                }

                // Reference to the original WordNet synset ("<POS> <offset>").
                MonolingualExternalRef offsetRef = new MonolingualExternalRef();
                offsetRef.setExternalSystem(resourceVersion + "_" + EXTERNAL_SYSTEM_SYNSET_OFFSET);
                offsetRef.setExternalReference(wnSynset.getPOS() + " " + wnSynset.getOffset());
                //TODO: implications of using "<offset>-<POS key>" as external reference instead?
                List<MonolingualExternalRef> externalRefs = new LinkedList<MonolingualExternalRef>();
                externalRefs.add(offsetRef);
                lmfSynset.setMonolingualExternalRefs(externalRefs);
            }
        }

        // Note: annotationList and annotationCounter are filled while the
        // glosses are processed, but they are not written out here.

        initialized = true;
    }

    /**
     * Splits the gloss of a WordNet synset into the sense definition and the
     * quoted sense examples. Every example is handed to
     * {@link #processExample(net.sf.extjwnl.data.Synset, String, List)}.
     * <p>
     * A double quote only starts the example part if the text collected so far
     * ends with one of the delimiters <code>;:.,)</code>. Any other quote,
     * including one at the very beginning of the gloss, is kept as part of the
     * definition.
     *
     * @param wnSynset WordNet synset whose gloss is processed
     * @param lmfSynset Uby-LMF synset created for wnSynset (currently not used)
     * @param statements list that receives the examples which could not be assigned to a sense
     * @return the sense definition without the examples; empty if the synset has no gloss
     * @throws JWNLException if WordNet's data cannot be accessed
     */
    protected String processGloss(final net.sf.extjwnl.data.Synset wnSynset, final Synset lmfSynset,
            final List<String> statements) throws JWNLException {
        // Split gloss into sense definition and sense examples.
        String gloss = wnSynset.getGloss();
        if (gloss == null) {
            // Nothing to split; the caller treats an empty definition as "no definition".
            return "";
        }
        String senseDefinition = "";
        String senseExamples = null;
        boolean endsWithDelim = false;
        do {
            int idx = gloss.indexOf("\"");
            if (idx >= 0) {
                senseDefinition = senseDefinition + gloss.substring(0, idx);
                gloss = gloss.substring(idx + 1);
                senseExamples = gloss;
            } else {
                // No (further) quote: the rest belongs to the definition. The
                // appended delimiter terminates the loop and is removed below.
                senseDefinition = senseDefinition + gloss + ";";
            }
            String tmp = senseDefinition.trim();
            // An empty prefix (gloss starting with a quote) has no delimiter, so the
            // quote is part of the definition. Without the emptiness check charAt(-1)
            // would throw a StringIndexOutOfBoundsException.
            endsWithDelim = !tmp.isEmpty() && ";:.,)".indexOf(tmp.charAt(tmp.length() - 1)) >= 0;
            if (!endsWithDelim) {
                // The quote was not an example start; put it back into the definition.
                senseDefinition = senseDefinition + "\"";
            }
        } while (!endsWithDelim);
        senseDefinition = senseDefinition.trim();
        if (!senseDefinition.isEmpty()) {
            // Remove the trailing delimiter.
            senseDefinition = senseDefinition.substring(0, senseDefinition.length() - 1).trim();
        }

        // Separate sense examples. senseExamples starts right after the opening
        // quote of the first example.
        if (senseExamples != null) {
            int idx;
            do {
                idx = senseExamples.indexOf("\"");
                if (idx >= 0) {
                    String senseExample = senseExamples.substring(0, idx);
                    processExample(wnSynset, senseExample, statements);
                    senseExamples = senseExamples.substring(idx + 1);

                    // Skip everything up to and including the opening quote of the next example.
                    idx = senseExamples.indexOf("\"");
                    if (idx >= 0) {
                        senseExamples = senseExamples.substring(idx + 1);
                    }
                }
            } while (idx >= 0);
        }
        return senseDefinition;
    }

    /**
     * Normalizes a text for matching: every run of whitespace and of the
     * punctuation characters <code>.,!?:;()`'-</code> is replaced by a single
     * blank, all other characters are converted to lower case and the result
     * is trimmed.
     *
     * @param text the text to normalize
     * @return the normalized text
     */
    protected String cleanText(final String text) {
        final String separators = " \t\n\r.,!?:;()`'-";
        final StringBuilder cleaned = new StringBuilder();
        boolean previousWasSeparator = false;
        for (int i = 0; i < text.length(); i++) {
            final char current = text.charAt(i);
            final boolean isSeparator = separators.indexOf(current) >= 0;
            if (!isSeparator) {
                cleaned.append(Character.toLowerCase(current));
            } else if (!previousWasSeparator) {
                // Only the first separator of a run produces a blank.
                cleaned.append(' ');
            }
            previousWasSeparator = isSeparator;
        }
        return cleaned.toString().trim();
    }

    /**
     * Tries to assign an example sentence of a synset to those senses of the
     * synset whose lemma occurs in the example. A sequence of increasingly
     * fuzzy heuristics is applied; the first one that yields a match decides.
     * If no heuristic succeeds, the example is recorded in
     * {@link #annotationList} and added to the given statements.
     * {@link #annotationCounter} counts how often each step succeeded or was passed.
     *
     * @param wnSynset WordNet synset the example belongs to
     * @param senseExample the example sentence as found in the gloss
     * @param statements list that receives the example if it cannot be assigned to a sense
     * @throws JWNLException if WordNet's data cannot be accessed
     */
    protected void processExample(final net.sf.extjwnl.data.Synset wnSynset, final String senseExample,
            final List<String> statements) throws JWNLException {
        // Clean example and sense lemmas. The example is padded with blanks so
        // that word boundaries can be tested with simple substring searches.
        String example = " " + cleanText(senseExample) + " ";
        List<ExampleMapping> mappings = new ArrayList<ExampleMapping>();
        for (Word word : wnSynset.getWords()) {
            mappings.add(new ExampleMapping(word.getSenseKey(), cleanText(word.getLemma())));
        }

        // Step 0 (manual disambiguation of examples) is not implemented.

        // Step 1: Check whether the lemma is a substring.
        boolean hasExactWordMatch = false;
        boolean hasPrefixMatch = false;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            if (example.indexOf(" " + lemma + " ") >= 0) {
                // Found exact word match.
                mapping.setScore(3);
                hasExactWordMatch = true;
                continue;
            }
            if (example.indexOf(" " + lemma) >= 0) {
                // The lemma is the prefix of a word (or word sequence) of the example.
                mapping.setScore(2);
                hasPrefixMatch = true;
                continue;
            }

            // Check for prefix matches for the full list of lemma tokens: every
            // token may be followed by arbitrary non-whitespace characters.
            // The tokens are quoted so that characters such as '+', '*' or '['
            // are matched literally instead of being read as regex syntax.
            StringBuilder regEx = new StringBuilder();
            for (String lemmaToken : segmentTokens(lemma)) {
                if (regEx.length() > 0) {
                    regEx.append(' ');
                }
                regEx.append(Pattern.quote(lemmaToken)).append("\\S*?");
            }
            if (Pattern.compile(regEx.toString()).matcher(example).find()) {
                mapping.setScore(1);
                hasPrefixMatch = true;
            }
        }

        if (hasExactWordMatch) {
            saveExampleMappings(senseExample, mappings, 3, true);
            annotationCounter[0]++;
            return;
        }
        annotationCounter[1]++;
        if (hasPrefixMatch) {
            saveExampleMappings(senseExample, mappings, 1, true);
            annotationCounter[2]++;
            return;
        }
        annotationCounter[3]++;

        // Step 2: Match single word lemmas with all base forms.
        Set<String> baseForms = makeBaseFormList(example);
        boolean hasBaseFormMatch = false;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            if (baseForms.contains(lemma)) {
                mapping.addScore(1);
                hasBaseFormMatch = true;
            }
        }

        if (hasBaseFormMatch) {
            saveExampleMappings(senseExample, mappings, 1, true);
            annotationCounter[4]++;
            return;
        }
        annotationCounter[5]++;

        // Step 3: Match multi-word lemmas with all base forms; every token of
        // the lemma has to be among the base forms.
        hasBaseFormMatch = false;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            List<String> lemmaTokens = segmentTokens(lemma);
            boolean hasMultiWordBaseFormMatch = true;
            for (String lemmaToken : lemmaTokens) {
                if (!baseForms.contains(lemmaToken)) {
                    hasMultiWordBaseFormMatch = false;
                    break;
                }
            }
            if (hasMultiWordBaseFormMatch) {
                mapping.addScore(1);
                hasBaseFormMatch = true;
            }
        }

        if (hasBaseFormMatch) {
            saveExampleMappings(senseExample, mappings, 1, false);
            annotationCounter[6]++;
            return;
        }
        annotationCounter[7]++;

        // Step 4: Find the longest prefix matches of all lemma tokens.
        int maxScore1 = 0; // highest score seen so far
        int maxScore2 = 0; // second highest score seen so far
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            List<String> lemmaTokens = segmentTokens(lemma);
            for (String lemmaToken : lemmaTokens) {
                // Trim each lemma token letter by letter and check for the longest
                // prefix match in the example sentence. Prefixes shorter than
                // three characters are not considered.
                int tokenLen = lemmaToken.length();
                for (int i = 0; i < tokenLen - 2; i++) {
                    String lemmaPrefix = " " + lemmaToken.substring(0, tokenLen - i);
                    if (example.indexOf(lemmaPrefix) >= 0) {
                        // The score grows by the length of the matched prefix.
                        mapping.addScore(lemmaPrefix.length() - 1);
                        break;
                    }
                }

            }

            int score = mapping.getScore();
            if (score >= maxScore1) {
                maxScore2 = maxScore1;
                maxScore1 = score;
            } else if (score >= maxScore2) {
                maxScore2 = score;
            }
        }

        // Accept the result only if exactly one lemma achieved a positive score.
        if (maxScore1 > 0 && maxScore2 == 0) {
            saveExampleMappings(senseExample, mappings, maxScore1, false);
            annotationCounter[8]++;
            return;
        }
        annotationCounter[9]++;

        // Step 5: This example requires manual disambiguation. Add it to the
        //    annotation list.
        if (annotationList == null) {
            annotationList = new ArrayList<String>();
        }
        annotationList.add(wnSynset.getOffset() + wnSynset.getPOS().getKey() + "\t" + senseExample);
        for (Word word : wnSynset.getWords()) {
            annotationList.add("\t\t" + word.getSenseKey() + "\t" + word.getLemma());
        }
        annotationList.add("");

        // Step 6: If we still have no clue about the example, add it to the
        //    statement class.
        statements.add(senseExample);
    }

    /**
     * Splits a text at every single blank. Consecutive, leading or trailing
     * blanks yield empty tokens; a text without blanks (including the empty
     * text) yields a list with the text as its only token.
     *
     * @param text the text to split
     * @return the tokens in order of appearance
     */
    protected List<String> segmentTokens(String text) {
        List<String> tokens = new ArrayList<String>();
        int start = 0;
        int blank = text.indexOf(' ', start);
        while (blank >= 0) {
            tokens.add(text.substring(start, blank));
            start = blank + 1;
            blank = text.indexOf(' ', start);
        }
        // Remainder after the last blank (possibly empty).
        tokens.add(text.substring(start));
        return tokens;
    }

    /**
     * Collects the blank-separated tokens of an example together with all
     * base forms WordNet's morphological processor returns for them. The
     * morphological processor is fetched from the dictionary on first use.
     *
     * @param example the (cleaned) example sentence
     * @return the non-empty tokens of the example and their base forms
     * @throws JWNLException if the morphological lookup fails
     */
    protected Set<String> makeBaseFormList(final String example) throws JWNLException {
        if (morphProcessor == null) {
            morphProcessor = wordnet.getMorphologicalProcessor();
        }

        final Set<String> baseForms = new TreeSet<String>();
        int start = 0;
        boolean moreTokens = true;
        while (moreTokens) {
            final int blank = example.indexOf(' ', start);
            moreTokens = blank >= 0;
            final String token = moreTokens ? example.substring(start, blank) : example.substring(start);
            start = blank + 1;

            if (!token.isEmpty()) {
                baseForms.add(token);
            }
            // Generate base forms for all POS to avoid POS tagging errors.
            for (POS pos : POS.values()) {
                baseForms.addAll(morphProcessor.lookupAllBaseForms(pos, token));
            }
        }

        return baseForms;
    }

    /**
     * Stores the example for all senses whose mapping reached the minimal score.
     *
     * @param example the example sentence
     * @param mappings the scored mapping candidates
     * @param minScore minimal score a mapping needs to be selected
     * @param preferLongerLemmas if true, a selected mapping is dropped when its
     *        lemma is contained in the (different) lemma of another selected mapping
     */
    protected void saveExampleMappings(final String example, final List<ExampleMapping> mappings,
            final int minScore, final boolean preferLongerLemmas) {
        // Keep the candidates that scored high enough.
        List<ExampleMapping> candidates = new ArrayList<ExampleMapping>();
        for (ExampleMapping candidate : mappings) {
            if (candidate.getScore() >= minScore) {
                candidates.add(candidate);
            }
        }

        // Drop candidates whose lemma is part of a longer candidate lemma.
        if (preferLongerLemmas) {
            List<ExampleMapping> longest = new ArrayList<ExampleMapping>();
            for (ExampleMapping candidate : candidates) {
                String lemma = candidate.getLemma();
                boolean subsumed = false;
                for (ExampleMapping other : candidates) {
                    String otherLemma = other.getLemma();
                    if (!lemma.equals(otherLemma) && otherLemma.contains(lemma)) {
                        subsumed = true;
                        break;
                    }
                }
                if (!subsumed) {
                    longest.add(candidate);
                }
            }
            candidates = longest;
        }

        // Register the example for every remaining sense.
        for (ExampleMapping candidate : candidates) {
            saveExampleMapping(example, candidate.getSenseKey());
        }
    }

    /**
     * Appends the example to the list of examples stored for the sense key;
     * the list is created when the first example of a sense key is saved.
     *
     * @param example the example sentence
     * @param senseKey sense key the example is assigned to
     */
    protected void saveExampleMapping(String example, String senseKey) {
        List<String> sentences = examples.get(senseKey);
        if (sentences != null) {
            sentences.add(example);
            return;
        }
        sentences = new ArrayList<String>();
        sentences.add(example);
        examples.put(senseKey, sentences);
    }

    /**
     * Returns the list of all UBY synsets generated by this generator.
     * The internal list itself is returned, not a copy; it is empty until
     * {@link #initialize()} has been called.
     * @return the generated synsets
     */
    public List<Synset> getSynsets() {
        return synsets;
    }

    /**
     * This method consumes a WordNet synset and returns its associated Uby-LMF synset,
     * generated by this generator.<br>
     * This method should be called after the generator has been initialized.
     * @param wnSynset WordNet synset for which the generated Uby-LMF synset should be returned
     * @return Uby-LMF synset associated with the consumed wnSynset, or null if no
     *         synset has been generated for it
     * @see Synset
     * @see net.sf.extjwnl.data.Synset
     * @see SynsetGenerator#initialize()
     */
    public Synset getLMFSynset(net.sf.extjwnl.data.Synset wnSynset) {
        return wnSynsetLMFSynsetMappings.get(wnSynset);
    }

    /**
     * This method returns all mappings between WordNet's synsets and the corresponding Uby-LMF synsets,
     * with WordNet's synsets as keys. The internal map itself is returned, not a copy.
     * @return synset mappings created by this generator
     * @see Synset
     * @see net.sf.extjwnl.data.Synset
     */
    Map<net.sf.extjwnl.data.Synset, Synset> getWNSynsetLMFSynsetMappings() {
        return wnSynsetLMFSynsetMappings;
    }

    /**
     * Returns the example sentences this generator extracted from the gloss of
     * the lexeme's synset and assigned to the lexeme. The examples are looked
     * up by the sense key of the lexeme.
     * @param lexeme a WordNet lexeme whose example sentences should be returned
     * @return the lexeme's example sentences, or null if none were assigned to it
     * @throws IllegalArgumentException if the sense key of the lexeme cannot be determined
     * @see Word
     * @see net.sf.extjwnl.data.Synset
     */
    public List<String> getExamples(Word lexeme) {
        final String senseKey;
        try {
            senseKey = lexeme.getSenseKey();
        } catch (JWNLException e) {
            throw new IllegalArgumentException(e);
        }
        return examples.get(senseKey);
    }

}