de.tudarmstadt.ukp.dkpro.lexsemresource.graph.AdjMatrixRandomWalkJGraphT.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.lexsemresource.graph.AdjMatrixRandomWalkJGraphT.java
Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package de.tudarmstadt.ukp.dkpro.lexsemresource.graph;

import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jgrapht.DirectedGraph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;

import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity;
import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource;
import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity.PoS;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.LexicalSemanticResourceException;

/**
 * A random walk in the graph.
 *
 * In Random Walk (RW) sampling, we uniformly at random pick a starting node and then simulate a random walk
 * on the graph. At every step with probability c = 0.15 (the value commonly used in literature) we fly back
 * to the starting node and re-start the random walk. There is a problem of getting stuck, for example, if the
 * starting node is a sink, and/or it belongs to a small, isolated component. The solution is: If, after a
 * very long number of steps, we do not visit enough nodes to meet the required sample size, we select another
 * starting node and repeat the procedure. In our experiments we run the random walk for 100 * n steps.
 * (Leskovec and Faloutsos, 2006)
 *
 * @author garoufi
 *
 */

public class AdjMatrixRandomWalkJGraphT {

    /** Probability of flying back to the start node instead of following an edge. */
    private static final double RESTART_PROBABILITY = 0.15;

    /** Sample fraction used when the requested one is outside the open interval (0, 1). */
    private static final double DEFAULT_SAMPLE_FRACTION = .15;

    /** The walk is capped at this many steps per node of the resource. */
    private static final long ITERATIONS_PER_NODE = 100L;

    /** A progress message is logged once every this many steps. */
    private static final long PROGRESS_LOG_INTERVAL = 10000L;

    private final Log logger = LogFactory.getLog(getClass());

    /** One generator shared by all steps of the walk (instead of a new one per step). */
    private final Random random = new Random();

    /** The sampled graph produced by the random walk. */
    public DirectedGraph<Entity, DefaultEdge> entityGraph;
    private LexicalSemanticResource lexSemResource;
    /** Adjacency information of the resource; queried for the children of the current node. */
    PersistentAdjacencyMatrix adjMatrix;
    /** Node the walk starts from and flies back to on a restart. */
    private Entity startEntity;
    /** Number of nodes in the resource, as reported by the adjacency matrix. */
    int resourceSize = 0;
    /** Number of vertices currently contained in the sampled graph. */
    int graphSize = 0;

    /**
     * Builds a random walk sample graph from a resource with a desired relative size.
     *
     * @param resource The resource to sample from.
     * @param size The size of the RW graph. It takes a double value from 0 to 1 (exclusively) that
     * corresponds to the fraction of the original graph's size that we want the sample to have.
     * Values around .15 should be OK (Leskovec and Faloutsos, 2006). Any other value (including
     * NaN) is replaced by .15 and an error is logged.
     * @throws LexicalSemanticResourceException
     */
    public AdjMatrixRandomWalkJGraphT(LexicalSemanticResource resource, double size)
            throws LexicalSemanticResourceException {

        // size of a sample must be strictly between 0 and 1; the negated form also rejects NaN
        double fraction = size;
        if (!(fraction > 0 && fraction < 1)) {
            logger.error("Requested sample size not acceptable. Setting the sample size to 15%");
            fraction = DEFAULT_SAMPLE_FRACTION;
        }

        adjMatrix = new PersistentAdjacencyMatrix(resource);

        this.resourceSize = adjMatrix.resourceSize;
        this.lexSemResource = resource;
        this.entityGraph = createRWGraph(resource, fraction);
    }

    /**
     * Creates a random walk graph from a resource with a desired size.
     *
     * The walk currently always starts at the hard-coded entity "Weltfriede"/"Weltfrieden"
     * (chosen for Wikipedia-AG-DE). It runs for at most 100 * resourceSize steps and stops early
     * as soon as the sample contains the desired number of nodes.
     *
     * @param resource The resource the start entity is looked up in.
     * @param size The fraction of the resource size that the sample should have.
     * @return The sampled graph; it always contains at least the start entity.
     * @throws LexicalSemanticResourceException
     * @throws IllegalStateException If the start entity cannot be obtained from the resource.
     */
    private DirectedGraph<Entity, DefaultEdge> createRWGraph(LexicalSemanticResource resource, double size)
            throws LexicalSemanticResourceException {

        DirectedGraph<Entity, DefaultEdge> graph = new DefaultDirectedGraph<Entity, DefaultEdge>(DefaultEdge.class);

        // for Wikipedia-AG-DE: start with node "Weltfrieden":
        Map<String, String> startEntityLexemes = new HashMap<String, String>();
        startEntityLexemes.put("Weltfriede", "-");
        startEntityLexemes.put("Weltfrieden", "-");
        startEntity = resource.getEntity(startEntityLexemes, PoS.unk);
        if (startEntity == null) {
            // fail with a clear message rather than with an anonymous failure inside addVertex
            throw new IllegalStateException(
                    "Could not obtain the start entity " + startEntityLexemes.keySet() + " from the resource.");
        }

        // output resource and sample size
        int sampleSize = (int) (resourceSize * size);
        logger.info("The size of the resource is " + resourceSize);
        logger.info("The size of the desired sample is " + sampleSize);

        graph.addVertex(startEntity);
        logger.info("Starting the RW with node: " + startEntity);

        Entity entity = startEntity;
        // long arithmetic: 100 * resourceSize overflows int for resources above ~21 million nodes
        long numIterations = ITERATIONS_PER_NODE * resourceSize;
        graphSize = 1;

        // run the random walk:
        for (long i = 0; i < numIterations; i++) {

            if (i % PROGRESS_LOG_INTERVAL == 0) {
                // numIterations is > 0 here, otherwise the loop body would not be entered
                long progress = 100L * i / numIterations;
                logger.info("Sample size progress: " + graphSize + " (" + percentage(graphSize, sampleSize)
                        + "%). Iteration progress: " + progress + "%");
            }

            // check if you got enough nodes:
            if (graphSize >= sampleSize) {
                logger.info("Desired sample size reached! Breaking the loop.");
                break;
            }

            // with probability 85% burn child and make it the current node,
            // with probability 15% fly back to start:
            if (random.nextDouble() > RESTART_PROBABILITY) {
                entity = burnChild(entity, graph);
            } else {
                entity = startEntity;
            }
        }
        return graph;
    }

    /**
     * Computes which percentage of {@code total} is covered by {@code part}, using long arithmetic.
     *
     * @param part The amount reached so far.
     * @param total The amount to reach.
     * @return {@code 100 * part / total}, or 100 if {@code total} is not positive (nothing left to
     * reach), which avoids a division by zero for tiny resources.
     */
    private static long percentage(long part, long total) {
        if (total <= 0) {
            return 100L;
        }
        return 100L * part / total;
    }

    /**
     * Randomly picks a child of the given source node, and burns it, i.e. puts it into the graph, along with
     * the corresponding edge.
     *
     * @param source The node whose children are candidates for the next step.
     * @param graph The sample graph that receives the child and the edge.
     * @throws LexicalSemanticResourceException
     * @return The child burned, if the source has children. The source itself, otherwise.
     */
    private Entity burnChild(Entity source, DirectedGraph<Entity, DefaultEdge> graph)
            throws LexicalSemanticResourceException {

        Set<Entity> children = adjMatrix.getAdjacencies(source);

        // a node without children is a dead end: stay where we are
        if (children == null || children.isEmpty()) {
            return source;
        }

        // randomly select one of the children:
        int numChildren = children.size();
        Entity child = children.toArray(new Entity[numChildren])[random.nextInt(numChildren)];

        // put child and edge into the graph, if not there yet, and return the child;
        // addVertex reports whether the vertex was actually new
        if (graph.addVertex(child)) {
            graphSize++;
        }
        if (!graph.containsEdge(source, child)) {
            graph.addEdge(source, child);
        }
        return child;
    }
}