de.tudarmstadt.informatik.lt.sogaardparser.MWEData.java Source code

Introduction

Here is the source code for de.tudarmstadt.informatik.lt.sogaardparser.MWEData.java
Source

package de.tudarmstadt.informatik.lt.sogaardparser;

/*
 * #%L
 * JUnsupervisedParser
 * %%
 * Copyright (C) 2016 Tim Feuerbach
 * %%
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 * #L%
 */

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.lang.StringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.logging.Logger;

/**
 * A mapping from multi-word expressions to their druid scores obtained from some corpus.
 * <p>
 * Expressions are keyed by their words joined with a single space, so {@code getScore("Barack", "Obama")} looks up
 * the entry {@code "Barack Obama"}.
 */
public class MWEData {
    /** Separator between the columns of a line in the DRUID data file. */
    private static final String COLUMN_SEPARATOR = "\t";

    /** Number of columns that are read from each line: n-gram size, expression, druid score. */
    private static final int REQUIRED_COLUMNS = 3;

    /** Separator placed between the words of an expression to build its lookup key. */
    private static final String WORD_SEPARATOR = " ";

    private final Map<String, Double> entries;

    private MWEData(Map<String, Double> entries) {
        this.entries = entries;
    }

    /**
     * Creates an instance that knows no multi-word expressions at all.
     */
    public static MWEData empty() {
        return new MWEData(Collections.<String, Double>emptyMap());
    }

    /**
     * Reads in a file in DRUID data format, discards any unigrams, and stores multi word expressions together with
     * their druid score.
     *
     * @param filename path of the MWE file
     * @param minScore the minimum druid score to load a MWE
     * @throws IOException if the file cannot be read or contains a malformed line
     */
    public MWEData(String filename, double minScore) throws IOException {
        this(Paths.get(filename), minScore);
    }

    /**
     * Reads in a file in DRUID data format, discards any unigrams, and stores multi word expressions together with
     * their druid score.
     * <p>
     * Each non-blank line must consist of at least three tab separated columns: the number of words of the
     * expression, the expression itself and its druid score. Blank lines are skipped. A warning is logged if no
     * expression was loaded.
     *
     * @param file     the MWE file
     * @param minScore the minimum druid score to load a MWE
     * @throws IOException if the file cannot be read, or if a line has too few columns or a non-numeric size or
     *                     score
     */
    public MWEData(Path file, double minScore) throws IOException {
        entries = new HashMap<>();
        try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.trim().isEmpty()) {
                    // Tolerate empty lines instead of failing on the missing columns.
                    continue;
                }

                String[] parts = line.split(COLUMN_SEPARATOR);
                if (parts.length < REQUIRED_COLUMNS) {
                    throw new IOException("Malformed line " + lineNumber + " in file " + file + ": expected at least "
                            + REQUIRED_COLUMNS + " tab separated columns but found " + parts.length);
                }

                int wordCount;
                double score;
                try {
                    wordCount = Integer.parseInt(parts[0]);
                    score = Double.parseDouble(parts[2]);
                } catch (NumberFormatException e) {
                    throw new IOException("Malformed line " + lineNumber + " in file " + file
                            + ": n-gram size or druid score is not a number", e);
                }

                // Unigrams are no multi-word expressions and are therefore never stored.
                if (wordCount > 1 && score >= minScore) {
                    entries.put(parts[1], score);
                }
            }
        }
        if (entries.isEmpty()) {
            Logger.getLogger(this.getClass().getName())
                    .warning("Found no multiword expressions in file " + file.toString());
        }
    }

    /**
     * Returns the druid score for the possible multi-word expression, constructed using the given tokens. If the
     * MWE is not in the database, 0.0 will be returned.
     *
     * @param words the words of the expression, in order
     * @return the stored druid score, or 0.0 for an unknown expression
     */
    public double getScore(String... words) {
        Double score = lookup(words);
        return score == null ? 0.0 : score;
    }

    /**
     * Looks up the stored score of the expression formed by the given words.
     *
     * @return the stored score, or {@code null} if the expression is not in the database
     */
    private Double lookup(String... words) {
        return entries.get(StringUtils.join(words, WORD_SEPARATOR));
    }

    /**
     * Creates a mapping from token id's to their multi-word-expression memberships. If no multi-word expressions are
     * loaded, each token will receive an empty set. Otherwise, will scan with an increasing window size (from 2
     * till mweMaxTokens) and add any multi-word expression found as a new id to each member token's set.
     * <p>
     * Only expressions that are actually contained in the database are considered, even if {@code mweMinScore} is
     * zero or negative.
     * <p>
     * Example: "Barack Obama visited France" yields a list ({0}, {0}, {}, {}) indicating that "Barack" and "Obama"
     * are both members of the same MWE.
     *
     * @param tokens       a sentence
     * @param mweMaxTokens the window size
     * @param mweMinScore  the minimum druid score
     * @return one set of MWE ids per token, in token order
     */
    List<Set<Integer>> createMWESets(List<Token> tokens, int mweMaxTokens, double mweMinScore) {
        int tokenCount = tokens.size();
        List<Set<Integer>> mweMemberships = new ArrayList<>(tokenCount);
        for (int i = 0; i < tokenCount; i++) {
            mweMemberships.add(new HashSet<Integer>());
        }

        // A window can never be larger than the sentence itself.
        int largestWindow = Math.min(mweMaxTokens, tokenCount);
        if (entries.isEmpty() || largestWindow < 2) {
            return mweMemberships;
        }

        // Extract the surface forms once; every token takes part in several windows.
        String[] texts = new String[tokenCount];
        int position = 0;
        for (Token token : tokens) {
            texts[position++] = token.getCoveredText();
        }

        int mweId = 0;
        for (int windowSize = 2; windowSize <= largestWindow; windowSize++) {
            for (int start = 0; start + windowSize <= tokenCount; start++) {
                int end = start + windowSize;
                Double score = lookup(Arrays.copyOfRange(texts, start, end));
                // Require a real database hit: the 0.0 default of getScore() must not satisfy a
                // minimum score of zero or below.
                if (score != null && score >= mweMinScore) {
                    for (int member = start; member < end; member++) {
                        mweMemberships.get(member).add(mweId);
                    }
                    mweId++;
                }
            }
        }

        return mweMemberships;
    }
}