de.tudarmstadt.ukp.lmf.transform.germanet.SubcategorizationFrameExtractor.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.lmf.transform.germanet.SubcategorizationFrameExtractor.java

Source

/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.germanet;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.lmf.model.enums.ECase;
import de.tudarmstadt.ukp.lmf.model.enums.EComplementizer;
import de.tudarmstadt.ukp.lmf.model.enums.EDeterminer;
import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalFunction;
import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalNumber;
import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticCategory;
import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticProperty;
import de.tudarmstadt.ukp.lmf.model.enums.ETense;
import de.tudarmstadt.ukp.lmf.model.enums.EVerbForm;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticArgument;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.semantics.SynSemArgMap;
import de.tudarmstadt.ukp.lmf.model.semantics.SynSemCorrespondence;
import de.tudarmstadt.ukp.lmf.model.syntax.LexemeProperty;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticArgument;

/**
 * This class extracts subcategorization frames of GermaNet 7.0
 * using a custom subcategorization mapping file.
 * <p>
 * The mapping file is read once, in the constructor. Every non-comment line has the form
 * {@literal <CODE>%<Arg>:..:<Arg>}, where each argument is a comma-separated list of
 * {@literal name=value} attributes. Codes whose arguments are identical after the (first)
 * {@code semanticRole} attribute has been stripped share a single {@link SubcategorizationFrame}.
 */
public class SubcategorizationFrameExtractor {

    private final InputStream subcatStream; // The stream of the file containing subcatMappings
    private int subcatFrameNumber = 0; // Running number for creating IDs of SubcategorizationFrames
    private int syntacticArgumentNumber = 0; // Running number for creating IDs of SyntacticArguments

    // frame code -> SubcategorizationFrame / SemanticPredicate created for that code
    private final Map<String, SubcategorizationFrame> codeFrameMappings = new HashMap<String, SubcategorizationFrame>();
    private final Map<String, SemanticPredicate> codePredMappings = new HashMap<String, SemanticPredicate>();

    // frame code -> raw argument string (syntactic attributes + semantic roles)
    private final Map<String, String> codeSynSemArgMapping = new HashMap<String, String>();
    // raw argument string -> the same string with the semantic role stripped
    private final Map<String, String> synSemArgSynArgMapping = new HashMap<String, String>();
    // purely syntactic argument string -> the frame shared by all codes with these arguments
    private final Map<String, SubcategorizationFrame> synArgSubcatFrameMapping = new HashMap<String, SubcategorizationFrame>();

    private final List<SemanticPredicate> semanticPredicates = new LinkedList<SemanticPredicate>(); // List of all SemanticPredicates
    private int semanticPredicateNumber = 0; // Running number for creating IDs of SemanticPredicates
    private int semanticArgumentNumber = 0; // Running number for creating IDs of SemanticArguments

    private final List<SynSemCorrespondence> synSemCorrespondences = new LinkedList<SynSemCorrespondence>();
    private int synSemCorrespondenceNumber = 0; // Running number for creating IDs

    // Mapping between LMF-Code of arguments (syntactic + semantic roles) and SynSemCorrespondence
    private final Map<String, SynSemCorrespondence> synsemargsSynSemCorrMap = new HashMap<String, SynSemCorrespondence>();

    private final Log logger = LogFactory.getLog(getClass());

    /**
     * Constructs a {@link SubcategorizationFrameExtractor} and immediately parses the mappings.
     * The stream is fully consumed and closed by the constructor.
     * @param subcatStream stream of the File containing the SubcategorizationFrame-mappings
     * @throws RuntimeException if reading the stream fails
     * @throws IllegalArgumentException if a non-blank, non-comment line contains no
     * {@literal %}-separated argument part
     */
    public SubcategorizationFrameExtractor(InputStream subcatStream) {
        this.subcatStream = subcatStream;
        parseSubcatMappings();
    }

    /**
     * This method consumes a frame
     * and returns its corresponding instance of {@link SubcategorizationFrame} class
     * @param frame a frame for which subcategorization frame should be returned
     * @return subcategorization frame associated with the consumed frame,
     * or {@code null} if the frame code is not contained in the mappings
     */
    public SubcategorizationFrame getSubcategorizationFrame(String frame) {
        return codeFrameMappings.get(frame);
    }

    /**
     * This method consumes a frame string-representation
     * and returns its corresponding instance of {@link SemanticPredicate} class
     * @param frame a frame for which semantic predicate should be returned
     * @return semantic predicate associated with the consumed frame,
     * or {@code null} if no semantic predicate was created for the frame code
     */
    public SemanticPredicate getSemanticPredicate(String frame) {
        return codePredMappings.get(frame);
    }

    /**
     * Returns a sorted list of all subcategorization frames contained in this extractor.
     * A new list is created on every call.
     * @return all subcategorization frames in this extractor
     * @see SubcategorizationFrame
     * @see SubcategorizationFrameExtractor
     */
    public List<SubcategorizationFrame> getSubcategorizationFrames() {
        List<SubcategorizationFrame> result =
                new LinkedList<SubcategorizationFrame>(synArgSubcatFrameMapping.values());
        Collections.sort(result);
        return result;
    }

    /**
     * This method returns all semantic predicates generated by this extractor.
     * The returned list is the extractor's internal list, not a copy.
     * @return a list of all semantic predicates generated by this extractor
     * @see SemanticPredicate
     * @see SubcategorizationFrameExtractor
     */
    public List<SemanticPredicate> getSemanticPredicates() {
        return semanticPredicates;
    }

    /**
     * This method returns all instances of {@link SynSemCorrespondence} class, generated by this extractor.
     * The returned list is the extractor's internal list, not a copy.
     * @return syntactic-semantic correspondences generated by this extractor.
     */
    public List<SynSemCorrespondence> getSynSemCorrespondences() {
        return synSemCorrespondences;
    }

    /**
     * This method parses the file containing the SubcategorizationFrame-mappings
     * and afterwards creates the frames, semantic predicates and correspondences.
     */
    private void parseSubcatMappings() {
        logger.info("Parsing subcategorization mappings...");
        readMappingLines();
        logger.info("Parsing subcategorization mappings done");
        createSubcategorizationFrames();
    }

    /**
     * Reads the mapping stream line by line and hands every line that does not
     * start with {@code #} to {@link #parseLine(String)}.
     * The reader (and thereby the underlying stream) is closed even if parsing fails.
     */
    private void readMappingLines() {
        try {
            // NOTE(review): decodes with the platform default charset, as before;
            // confirm the encoding of the mapping file before pinning an explicit one.
            BufferedReader input = new BufferedReader(new InputStreamReader(subcatStream));
            try {
                String line;
                while ((line = input.readLine()) != null) {
                    if (!line.startsWith("#")) {
                        parseLine(line);
                    }
                }
            } finally {
                input.close();
            }
        } catch (IOException e) {
            throw new RuntimeException("Error on reading subcategorization mapping file", e);
        }
    }

    /**
     * Creates one {@link SubcategorizationFrame} per distinct syntactic argument string
     * and registers it for every code that maps to it. For codes whose arguments
     * specify a semantic role, a {@link SemanticPredicate} is created as well.
     */
    private void createSubcategorizationFrames() {
        for (Map.Entry<String, String> entry : codeSynSemArgMapping.entrySet()) {
            String code = entry.getKey();
            String synSemArgs = entry.getValue();
            String synArgs = synSemArgSynArgMapping.get(synSemArgs);

            SubcategorizationFrame subcategorizationFrame = synArgSubcatFrameMapping.get(synArgs);
            if (subcategorizationFrame == null) {
                // first code with these syntactic arguments: it also provides the frame's label
                subcategorizationFrame = new SubcategorizationFrame();
                subcategorizationFrame.setId("GN_SubcategorizationFrame_" + subcatFrameNumber);
                subcategorizationFrame.setSubcatLabel(code);
                subcatFrameNumber++;
                subcategorizationFrame = parseArguments(synSemArgs, subcategorizationFrame);
                synArgSubcatFrameMapping.put(synArgs, subcategorizationFrame);
            }
            codeFrameMappings.put(code, subcategorizationFrame);

            if (synSemArgs.contains("semanticRole")) { //only few GN-frames specify a semantic role
                SemanticPredicate semanticPredicate = parseSemanticArguments(synSemArgs, subcategorizationFrame);
                codePredMappings.put(code, semanticPredicate);
                semanticPredicates.add(semanticPredicate);
            }
        }
    }

    /**
     * This method parses a line of SubcatMappings-file
     * Line of SubcatMappings-file has the form: {@literal <CODE>%<Arg>:..:<Arg>}
     * <p>
     * Blank lines are ignored.
     * @param line line of SubcatMappings-file
     * @throws IllegalArgumentException if a non-blank line has no argument part after the code
     */
    private void parseLine(String line) {
        if (line.trim().isEmpty()) {
            return;
        }
        String[] parts = line.split("%");
        if (parts.length < 2) {
            throw new IllegalArgumentException(
                    "Malformed subcategorization mapping line (expected <CODE>%<Arg>:..:<Arg>): " + line);
        }
        String code = parts[0];
        String synSemArgs = parts[1];
        codeSynSemArgMapping.put(code, synSemArgs);

        if (synSemArgs.contains("semanticRole")) {
            // NOTE(review): only the first ",semanticRole=<lowercase letters>" is stripped;
            // verify against the mapping file that no line carries several semantic roles.
            String synArgs = synSemArgs.replaceFirst(",semanticRole=[a-z]+", "");
            synSemArgSynArgMapping.put(synSemArgs, synArgs);
        } else {
            synSemArgSynArgMapping.put(synSemArgs, synSemArgs);
        }
    }

    /**
     * This method parses syntactic arguments encoded in a line of subcategorization mapping file.
     * An argument containing {@code syntacticProperty} does not become a syntactic argument
     * but is stored as the {@link LexemeProperty} of the frame.
     * @param synSemArgs part of the line encoding the arguments
     * @param subcatFrame subcategorization frame to which syntactic arguments should be appended
     * @return subcategorization frame with appended syntactic arguments (the consumed instance)
     * @see SubcategorizationFrame
     * @see SyntacticArgument
     */
    private SubcategorizationFrame parseArguments(String synSemArgs, SubcategorizationFrame subcatFrame) {
        List<SyntacticArgument> synArgs = new LinkedList<SyntacticArgument>();
        for (String arg : synSemArgs.split(":")) {
            if (arg.contains("syntacticProperty")) {
                String sp = arg.split("=")[1];
                if (sp.equals("raising")) {
                    // the mapping file abbreviates the enum constant name
                    sp = "subjectRaising";
                }
                LexemeProperty lexemeProperty = new LexemeProperty();
                lexemeProperty.setSyntacticProperty(ESyntacticProperty.valueOf(sp));
                subcatFrame.setLexemeProperty(lexemeProperty);
            } else {
                SyntacticArgument syntacticArgument = new SyntacticArgument();
                syntacticArgument.setId("GN_SyntacticArgument_" + syntacticArgumentNumber);
                syntacticArgumentNumber++;
                for (String att : arg.split(",")) {
                    applyAttribute(syntacticArgument, att);
                }
                synArgs.add(syntacticArgument);
            }
        }
        subcatFrame.setSyntacticArguments(synArgs);
        return subcatFrame;
    }

    /**
     * Sets a single {@literal name=value} attribute on a syntactic argument.
     * Attributes with an unknown name (e.g. {@code semanticRole}) are ignored.
     * @param syntacticArgument the argument to be modified
     * @param att the attribute in the form {@literal name=value}
     */
    private void applyAttribute(SyntacticArgument syntacticArgument, String att) {
        String[] splits = att.split("=");
        String attName = splits[0];
        if (attName.equals("grammaticalFunction")) {
            // needs some extra care because of incomplete names in the mappings-file...
            String gf = splits[1];
            if (gf.endsWith("Comp")) {
                gf = gf.concat("lement");
            }
            syntacticArgument.setGrammaticalFunction(EGrammaticalFunction.valueOf(gf));
        } else if (attName.equals("syntacticCategory")) {
            syntacticArgument.setSyntacticCategory(ESyntacticCategory.valueOf(splits[1]));
        } else if (attName.equals("optional")) {
            syntacticArgument.setOptional(splits[1].equals("yes"));
        } else if (attName.equals("case")) {
            syntacticArgument.setCase(ECase.valueOf(splits[1]));
        } else if (attName.equals("determiner")) {
            syntacticArgument.setDeterminer(EDeterminer.valueOf(splits[1]));
        } else if (attName.equals("preposition")) {
            syntacticArgument.setPreposition(splits[1]);
        } else if (attName.equals("prepositionType")) {
            syntacticArgument.setPrepositionType(splits[1]);
        } else if (attName.equals("number")) {
            syntacticArgument.setNumber(EGrammaticalNumber.valueOf(splits[1]));
        } else if (attName.equals("lex")) {
            syntacticArgument.setLexeme(splits[1]);
        } else if (attName.equals("verbForm")) {
            syntacticArgument.setVerbForm(EVerbForm.valueOf(splits[1]));
        } else if (attName.equals("tense")) {
            syntacticArgument.setTense(ETense.valueOf(splits[1]));
        } else if (attName.equals("complementizer")) {
            syntacticArgument.setComplementizer(EComplementizer.valueOf(splits[1]));
        }
    }

    /**
     * This method consumes the part of the line of subcategorization mapping file encoding semantic arguments. <br>
     * It parses the arguments and returns an instance of {@link SemanticPredicate} class containing the arguments.
     * As a side effect a {@link SynSemCorrespondence} holding one {@link SynSemArgMap}
     * per semantic role is created and registered in this extractor.
     * @param synSemArgs part of the line encoding semantic arguments.
     * @param subcategorizationFrame instance of {@link SubcategorizationFrame} class used for creating
     * instances of {@link SynSemArgMap} class
     * @return semantic predicate containing parsed semantic arguments
     */
    private SemanticPredicate parseSemanticArguments(String synSemArgs,
            SubcategorizationFrame subcategorizationFrame) {
        SemanticPredicate semanticPredicate = new SemanticPredicate();
        semanticPredicate.setId("GN_SemanticPredicate_" + semanticPredicateNumber);
        semanticPredicateNumber++;
        List<SemanticArgument> semanticArguments = new LinkedList<SemanticArgument>();
        // list of mappings between Syntactic and Semantic Arguments is to be created
        List<SynSemArgMap> synSemArgMaps = new LinkedList<SynSemArgMap>();

        String[] args = synSemArgs.split(":");
        int index = 0;
        // iterate over syntactic Arguments; args runs in parallel to them
        for (SyntacticArgument synArg : subcategorizationFrame.getSyntacticArguments()) {
            String synsemArg = args[index];
            if (synsemArg.contains("syntacticProperty")) {
                // syntacticProperty entries have no SyntacticArgument counterpart: skip them
                index++;
                synsemArg = args[index];
            }
            // look at synsemArg: is semantic role defined? if yes: create corresponding semanticArg
            for (String att : synsemArg.split(",")) {
                String[] splits = att.split("=");
                if (splits[0].equals("semanticRole")) {
                    SemanticArgument semanticArgument = new SemanticArgument();
                    semanticArgument.setId("GN_SemanticArgument_" + semanticArgumentNumber);
                    semanticArgumentNumber++;
                    semanticArgument.setSemanticRole(splits[1]);
                    semanticArguments.add(semanticArgument);
                    // Generate SynSemArgMapping; a fresh instance per pair, otherwise every
                    // list entry would be the same object holding only the last pair
                    SynSemArgMap synSemArgMap = new SynSemArgMap();
                    synSemArgMap.setSyntacticArgument(synArg);
                    synSemArgMap.setSemanticArgument(semanticArgument);
                    synSemArgMaps.add(synSemArgMap);
                }
            }
            index++;
        }
        semanticPredicate.setSemanticArguments(semanticArguments);

        SynSemCorrespondence synSemCorrespondence = new SynSemCorrespondence();
        synSemCorrespondence.setId("GN_SynSemCorrespondence_" + synSemCorrespondenceNumber);
        synSemCorrespondenceNumber++;
        synSemCorrespondence.setSynSemArgMaps(synSemArgMaps);
        synSemCorrespondences.add(synSemCorrespondence);
        synsemargsSynSemCorrMap.put(synSemArgs, synSemCorrespondence);
        return semanticPredicate;
    }

}