de.tudarmstadt.ukp.dkpro.wsd.linkbased.algorithm.WikipediaRelatednessMethod.java Source code


Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.wsd.linkbased.algorithm.WikipediaRelatednessMethod.java

Source

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package de.tudarmstadt.ukp.dkpro.wsd.linkbased.algorithm;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.collections.ListUtils;
import org.apache.log4j.Logger;

import de.tudarmstadt.ukp.dkpro.wsd.algorithm.AbstractWSDAlgorithm;
import de.tudarmstadt.ukp.dkpro.wsd.algorithm.WSDAlgorithmCollectiveBasic;
import de.tudarmstadt.ukp.dkpro.wsd.si.SenseInventory;
import de.tudarmstadt.ukp.dkpro.wsd.si.SenseInventoryException;
import de.tudarmstadt.ukp.dkpro.wsd.si.linkdatabase.LinkDatabaseInventoryResource;

/**
 * A disambiguation algorithm that scores candidate senses by their relatedness to one another, computed from shared incoming links with the formula by Milne & Witten
 *
 * @author nico.erbs@gmail.com
 *
 */
public class WikipediaRelatednessMethod extends AbstractWSDAlgorithm implements WSDAlgorithmCollectiveBasic {

    public WikipediaRelatednessMethod(SenseInventory inventory) {
        super(inventory);
    }

    private final static Logger logger = Logger.getLogger(WikipediaRelatednessMethod.class.getName());

    private final Map<String, List<String>> incomingLinksList = new HashMap<String, List<String>>();

    private Set<String> possibleCandidates;

    @Override
    public Map<String, Map<String, Double>> getDisambiguation(Collection<String> sods)
            throws SenseInventoryException {
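        // Collect all candidate senses for the given subjects of disambiguation,
        // retrieve their incoming links from the LinkDatabaseInventoryResource,
        // and score each candidate by its average Milne & Witten relatedness
        // to all other candidates.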
        Map<String, Map<String, Double>> disambiguationResults = new HashMap<String, Map<String, Double>>();

        logger.info("Get all possible senses");
        possibleCandidates = new HashSet<String>();
        List<String> senses;
        HashMap<String, Double> disambiguations;
        for (String sod : sods) {
            senses = inventory.getSenses(sod);

            possibleCandidates.addAll(senses);

            disambiguations = new HashMap<String, Double>();
            for (String sense : senses) {
                disambiguations.put(sense, 0d);
            }
            disambiguationResults.put(sod, disambiguations);
        }

        logger.info("Computing incoming links");

        for (String candidate : possibleCandidates) {
            logger.info("Compute incoming links for " + candidate);
            if (!incomingLinksList.containsKey(candidate)) {
                incomingLinksList.put(candidate,
                        ((LinkDatabaseInventoryResource) inventory).getIncomingLinks(candidate));
            }
        }

        //create list with disambiguation results
        for (String sod : disambiguationResults.keySet()) {
            for (String candidate : disambiguationResults.get(sod).keySet()) {
                disambiguationResults.get(sod).put(candidate, computeWikipediaLinkMeasure(candidate));
            }
        }

        return disambiguationResults;
    }

    private Integer countIncomingSharedLinks(String candidate0, String candidate1) {

        return ListUtils.intersection(incomingLinksList.get(candidate0), incomingLinksList.get(candidate1)).size();
    }

    private Double computeWikipediaLinkMeasure(String candidate) {
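        // Score the candidate as its average Milne & Witten relatedness to all
        // other candidate senses collected for the current document.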
        logger.info("Computing score for " + candidate);

        double relatedness = 0;
        //start with -1 so that a sense's comparison with itself, which contributes zero, is not counted in the average
        int counter = -1;
        int maxIncomingLinks;
        int minIncomingLinks;
        int numberOfSenses = ((LinkDatabaseInventoryResource) inventory).getNumberOfSenses();
        for (String otherCandidate : possibleCandidates) {
            maxIncomingLinks = Math.max(incomingLinksList.get(candidate).size(),
                    incomingLinksList.get(otherCandidate).size());
            minIncomingLinks = Math.min(incomingLinksList.get(candidate).size(),
                    incomingLinksList.get(otherCandidate).size());
            counter++;
            relatedness += (Math.log(maxIncomingLinks)
                    - Math.log(countIncomingSharedLinks(candidate, otherCandidate)))
                    / (Math.log(numberOfSenses) - Math.log(minIncomingLinks));
        }

        //Formula by Milne & Witten:
        //    relatedness(a, b) = (log(max(|A|, |B|)) - log(|A ∩ B|))
        //                      / (log(|W|) - log(min(|A|, |B|)))
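        //(A and B are the sets of articles linking to senses a and b; W is the set of all articles.)
        //Worked example with hypothetical counts: |A| = 100, |B| = 50,
        //|A ∩ B| = 10 shared in-links, |W| = 1,000,000 articles:
        //(log 100 - log 10) / (log 1000000 - log 50) ≈ 2.30 / 9.90 ≈ 0.23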

        return relatedness / counter;
    }

}
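
For reference, the Milne & Witten link-overlap relatedness used in computeWikipediaLinkMeasure() can be reproduced in isolation. The following is a minimal, self-contained sketch of that formula on toy data; the class name, page names, and counts are invented for illustration and are not part of DKPro WSD.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Minimal, self-contained sketch of the Milne & Witten link-overlap
// relatedness computed in WikipediaRelatednessMethod above.
// All page names and counts below are invented for illustration.
public class MilneWittenRelatednessSketch {

    // relatedness(a, b) = (log(max(|A|, |B|)) - log(|A ∩ B|))
    //                   / (log(|W|) - log(min(|A|, |B|)))
    // where A and B are the sets of pages linking to a and b,
    // and W is the set of all pages in the link database.
    static double relatedness(Set<String> inA, Set<String> inB, long totalPages) {
        Set<String> shared = new HashSet<String>(inA);
        shared.retainAll(inB); // A ∩ B

        double max = Math.max(inA.size(), inB.size());
        double min = Math.min(inA.size(), inB.size());

        return (Math.log(max) - Math.log(shared.size()))
                / (Math.log(totalPages) - Math.log(min));
    }

    public static void main(String[] args) {
        Set<String> incomingA = new HashSet<String>(
                Arrays.asList("Page1", "Page2", "Page3", "Page4"));
        Set<String> incomingB = new HashSet<String>(
                Arrays.asList("Page3", "Page4", "Page5"));

        // |A| = 4, |B| = 3, |A ∩ B| = 2, |W| = 1000:
        // (log 4 - log 2) / (log 1000 - log 3) ≈ 0.69 / 5.81 ≈ 0.12
        System.out.println(relatedness(incomingA, incomingB, 1000));
    }
}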