Source code

Java tutorial


Here is the source code for de.tudarmstadt.ukp.dkpro.lexsemresource.graph.EntityGraphJGraphT.java


/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.lexsemresource.graph;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jgrapht.DirectedGraph;
import org.jgrapht.Graph;
import org.jgrapht.UndirectedGraph;
import org.jgrapht.alg.ConnectivityInspector;
import org.jgrapht.alg.DijkstraShortestPath;
import org.jgrapht.graph.AsUndirectedGraph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;

import de.tudarmstadt.ukp.dkpro.lexsemresource.Entity;
import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource;
import de.tudarmstadt.ukp.dkpro.lexsemresource.core.util.LoggingUtils;
import de.tudarmstadt.ukp.dkpro.lexsemresource.core.util.ProgressMeter;
import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.LexicalSemanticResourceException;

/**
 * A graph constructed from all entities in a lexical semantic resource.
 *
 * @author zesch
 */
public class EntityGraphJGraphT implements EntityGraph {
    // per-instance logger, named after the runtime class
    private final Log logger = LogFactory.getLog(getClass());

    // the maximum depth of a path from a node to root
    // if no path was found after expanding to that depth, it is assumed that there is no path
    // should be higher than the deepest resource
    private static final int MAX_EXPANSION_DEPTH = 18;

    // the directed entity graph and an undirected view wrapped around it
    protected DirectedGraph<Entity, DefaultEdge> directedGraph;
    protected UndirectedGraph<Entity, DefaultEdge> undirectedGraph;

    // a map holding the degree distribution of the graph
    private final Map<Integer, Integer> degreeDistribution = new HashMap<Integer, Integer>();

    // a map from nodes to their eccentricity values
    private final Map<Entity, Integer> eccentricityMap = new HashMap<Entity, Integer>();

    // A map holding the (recursive) number of hyponyms for each node.
    // Recursive means that the hyponyms of hyponyms are also taken into account.
    private Map<String, Integer> hyponymCountMap;
    private final String hyponymCountMapFilename = "hypoCountMap";
    // NOTE(review): presumably selects whether hyponym counts are computed on the largest
    // connected component only - confirm against the code that reads this flag
    private boolean hyponymCountMapUseLcc = true;

    // a mapping from all nodes to a list of nodes on the path to the root
    private Map<String, List<String>> rootPathMap = null;
    private final String rootPathMapFilename = "rootPathMap";

    // file the graph is serialized to / loaded from; its name is the graphId
    private File serializedGraphFile;
    private String graphId;

    private CycleHandlerJGraphT cycleHandler;

    // lazily computed graph parameters; a negative value means "not computed yet"
    private double averageShortestPathLength = Double.NEGATIVE_INFINITY;
    private double diameter = Double.NEGATIVE_INFINITY;
    private double clusterCoefficient = Double.NEGATIVE_INFINITY;
    private double depth = Double.NEGATIVE_INFINITY;

    // the lexical semantic resource this graph was built from
    private LexicalSemanticResource lexSemRes;

    protected EntityGraphJGraphT getEntityGraphJGraphT(LexicalSemanticResource aLsr)
            throws LexicalSemanticResourceException {
        return getEntityGraphJGraphT(aLsr, aLsr.getEntities(), "", aLsr.getNumberOfEntities());

    protected EntityGraphJGraphT getEntityGraphJGraphT(LexicalSemanticResource lexSemResource,
            Iterable<Entity> nodesToConsider, String nameSuffix, int numEntities)
            throws LexicalSemanticResourceException {

        lexSemRes = lexSemResource;
        cycleHandler = new CycleHandlerJGraphT(this);

        graphId = "graphSer_" + lexSemResource.getResourceName() + nameSuffix + "_"
                + lexSemResource.getResourceVersion();

        String defaultSerializedGraphLocation = graphId;
        serializedGraphFile = new File(defaultSerializedGraphLocation);
        if (serializedGraphFile.exists()) {
            try {
      "Loading entity graph: " + serializedGraphFile.getAbsolutePath());
                directedGraph = GraphSerialization.loadGraph(defaultSerializedGraphLocation);
                undirectedGraph = new AsUndirectedGraph<Entity, DefaultEdge>(directedGraph);
      "Finished loading entity graph.");
            } catch (IOException e) {
                throw new LexicalSemanticResourceException(e);
            } catch (ClassNotFoundException e) {
                throw new LexicalSemanticResourceException(e);
        } else {
  "Creating entity graph.");
            directedGraph = createGraph(lexSemResource, nodesToConsider, numEntities);
            undirectedGraph = new AsUndirectedGraph<Entity, DefaultEdge>(directedGraph);
  "Finished creating entity graph.");

            try {
                GraphSerialization.saveGraph(directedGraph, serializedGraphFile);
            } catch (IOException e) {
                throw new LexicalSemanticResourceException(e);


        return this;

    /**
     * Create a graph representation of the lexical semantic resource. The graph may contain
     * vertices that do not represent an entity in Wiktionary, but tokens that are linked in a
     * Wiktionary entry without having been created, yet.
     *
     * @param lexSemResource
     * @throws LexicalSemanticResourceException
     * @throws UnsupportedOperationException
     */
    // Builds a directed graph from entitiesToConsider: for every entity the children and the
    // parents reported by lexSemResource are linked with parent -> child edges. Self-loops and
    // edges that already exist are skipped. numEntities is handed to the ProgressMeter.
    // NOTE(review): this listing appears to have lost lines - the bodies of the containsVertex
    // checks and of the two logging branches are empty and no closing braces are visible.
    // Presumably the missing vertex was added in each containsVertex branch and progress was
    // advanced/logged in the logging branches - TODO restore from the upstream source.
    private DirectedGraph<Entity, DefaultEdge> createGraph(LexicalSemanticResource lexSemResource,
            Iterable<Entity> entitiesToConsider, int numEntities) throws LexicalSemanticResourceException {
        DirectedGraph<Entity, DefaultEdge> graph = new DefaultDirectedGraph<Entity, DefaultEdge>(DefaultEdge.class);
        ProgressMeter progress = new ProgressMeter(numEntities);
        for (Entity entity : entitiesToConsider) {
            // NOTE(review): empty branch - the entity is presumably meant to be added here
            if (!graph.containsVertex(entity)) {

            // link the entity to each of its children (entity -> child)
            Set<Entity> children = lexSemResource.getChildren(entity);
            if (children != null) {
                for (Entity child : children) {
                    // NOTE(review): empty branch - the child is presumably meant to be added here
                    if (!graph.containsVertex(child)) {
                    if (!graph.containsEdge(entity, child) && !entity.equals(child)) {
                        graph.addEdge(entity, child);

            // sometimes an API does not deliver fully symmetric relations, thus check parents, but
            // also check for already existing edges
            Set<Entity> parents = lexSemResource.getParents(entity);
            if (parents != null) {
                for (Entity parent : parents) {
                    // NOTE(review): empty branch - the parent is presumably meant to be added here
                    if (!graph.containsVertex(parent)) {
                    if (!graph.containsEdge(parent, entity) && !entity.equals(parent)) {
                        graph.addEdge(parent, entity);

            // progress reporting: always at debug level, every 100th count at info level
            // NOTE(review): both branches are empty in this listing
            if (logger.isDebugEnabled()) {
            } else if (logger.isInfoEnabled() && (progress.getCount() % 100 == 0)) {
        return graph;

    /**
     * @return Returns the graph.
     * @throws LexicalSemanticResourceException
     */
    public DirectedGraph<Entity, DefaultEdge> getGraph() throws LexicalSemanticResourceException {
        return directedGraph;

    public int getNumberOfNodes() {
        return directedGraph.vertexSet().size();

    public int getNumberOfEdges() {
        return directedGraph.edgeSet().size();

    public boolean containsVertex(Entity vertex) {
        return directedGraph.containsVertex(vertex);

    public boolean containsEdge(Entity source, Entity target) {
        return directedGraph.containsEdge(source, target);

    public Set<Entity> getChildren(Entity vertex) {
        Set<Entity> outLinks = new HashSet<Entity>();
        if (directedGraph.containsVertex(vertex)) {
            Set<DefaultEdge> outgoingEdges = directedGraph.outgoingEdgesOf(vertex);
            for (DefaultEdge edge : outgoingEdges) {
        } else {
  "Graph does not contain vertex " + vertex);
        return outLinks;

    public Set<Entity> getParents(Entity vertex) {
        Set<Entity> inLinks = new HashSet<Entity>();
        if (directedGraph.containsVertex(vertex)) {
            Set<DefaultEdge> incomingEdges = directedGraph.incomingEdgesOf(vertex);
            for (DefaultEdge edge : incomingEdges) {
        } else {
  "Graph does not contain vertex " + vertex);
        return inLinks;

    public Set<Entity> getNeighbors(Entity vertex) {
        Set<Entity> union = new HashSet<Entity>(getChildren(vertex));
        return union;

    public int getInDegree(Entity vertex) {
        if (directedGraph.containsVertex(vertex)) {
            return directedGraph.inDegreeOf(vertex);
            // if there is no such node define the indegree as -1
        } else {
            return -1;

    public int getOutDegree(Entity vertex) {
        if (directedGraph.containsVertex(vertex)) {
            return directedGraph.outDegreeOf(vertex);
            // if there is no such node define the outdegree as -1
        } else {
            return -1;

    public int getDegree(Entity vertex) {
        // implementation without JGraphT:
        return getInDegree(vertex) + getOutDegree(vertex);

        // implementation with JGraphT:
        // The directed graph is treated as a undirected graph
        // UndirectedGraph<String, DefaultEdge> undirectedGraph = new AsUndirectedGraph<String,
        // DefaultEdge>(graph);
        // if (undirectedGraph.containsVertex(vertex))
        // return undirectedGraph.degreeOf(vertex);
        // else
        // return -1;

    public double getAverageDegree() {
        // implementation without JGraphT:
        double degreeSum = 0.0;

        // iterate over all node pairs
        Set<Entity> nodes = directedGraph.vertexSet();
        for (Entity node : nodes) {
            degreeSum += getDegree(node);

        return degreeSum / getNumberOfNodes();

    public Set<Entity> getLeaves() {
        Set<Entity> leafNodes = new HashSet<Entity>();
        for (Entity node : directedGraph.vertexSet()) {
            if (getOutDegree(node) == 0) {
        return leafNodes;

    public int getNumberOfLeaves() {
        return getLeaves().size();

    public Set<Entity> getRoots() {
        Set<Entity> rootNodes = new HashSet<Entity>();
        for (Entity node : directedGraph.vertexSet()) {
            if (getInDegree(node) == 0) {
        return rootNodes;

    public int getNumberOfRoots() {
        return getRoots().size();

    public List<Entity> getShortestPath(Entity source, Entity target, DirectionMode mode) {

        // if source == target than return a list with just one entity
        if (source.equals(target)) {
            List<Entity> resultList = new ArrayList<Entity>();
            return resultList;

        Graph<Entity, DefaultEdge> graph;
        if (mode.equals(DirectionMode.directed)) {
            graph = directedGraph;
        } else {
            graph = undirectedGraph;

        // The algorithm runs on the directed version of the graph
        // get the path from source to target
        if (graph.containsVertex(source) && graph.containsVertex(target)) {
            List<DefaultEdge> edgeList = DijkstraShortestPath.findPathBetween(graph, source, target);

            // no path between the vertices
            if (edgeList == null) {
                return null;

            List<Entity> resultList = new ArrayList<Entity>();
            for (int i = 0; i < edgeList.size(); i++) {
                if (i == 0) {
                } else {
            return resultList;
        // if either source or target are not contained in the graph define the path as null
        // this is different than the empty path, which occurs if source = target
        else {
            return null;

    public double getShortestPathLength(Entity source, Entity target, DirectionMode mode) {
        List<Entity> path = getShortestPath(source, target, mode);
        if (path != null) {
            return path.size() - 1; // we need to decrease by one, as we want to return the distance
            // in edges, not nodes
        } else {
            return Double.POSITIVE_INFINITY;

    public Iterable<Entity> getNodes() {
        return directedGraph.vertexSet();

    public Iterable<EntityGraphEdge> getEdges() {
        return new JGraphTEntityEdgeIterable(directedGraph);

    public boolean isSymmetricLink(Entity source, Entity target) {
        if (containsEdge(source, target) && containsEdge(target, source)) {
            return true;
        } else {
            return false;

    public int getNumberOfSymmetricLinks() {
        int symLinksSum = 0;

        // this can be done still more efficiently if i put the nodes in a list and remove from the
        // list current node as well as its children after each loop
        int progress = 0;

        Set<Entity> nodes = directedGraph.vertexSet();
        for (Entity source : nodes) {
            for (Entity target : getChildren(source)) {
                // ApiUtilities.printProgressInfo(progress, max, 100,
                // ApiUtilities.ProgressInfoMode.TEXT, "Counting symmetric links");

                if (isSymmetricLink(source, target)) {
        return symLinksSum / 2;

    public Set<Entity> getIsolatedNodes() {
        Set<Entity> isolatedNodes = new HashSet<Entity>();
        for (Entity node : directedGraph.vertexSet()) {
            if (getDegree(node) == 0) {
        return isolatedNodes;

    public int getNumberOfIsolatedNodes() {
        return getIsolatedNodes().size();

    /**
     * Computes the shortest path from node to all other nodes. Paths to nodes that have already
     * been the source of the shortest path computation are omitted (the path was already added to
     * the path sum). Updates the sum of shortest path lengths and the diameter of the graph. As the
     * JGraphT BreadthFirstIterator does not provide information about the distance to the start
     * node in each step, we will use our own BFS implementation.
     *
     * @param pStartNode
     *            The start node of the search.
     * @param pBigShortestPathLengthSum
     *            The sum of the shortest path lengths.
     * @param pBigMaxPathLength
     *            The maximum path length found so far.
     * @param pWasSource
     *            A set of nodes which have been the start node of the computation process. For such
     *            nodes all path lengths have been already computed.
     * @return An array of two BigInteger values. The first value is the shortestPathLengthSum, the
     *         second value is the maxPathLength. They are returned as an array for performance
     *         reasons. I do not want to create an object, as this function is called *very* often.
     */
    private BigInteger[] computeShortestPathLengths(Entity pStartNode, BigInteger pBigShortestPathLengthSum,
            BigInteger pBigMaxPathLength, Set<Entity> pWasSource) {

        int pStartNodeMaxPathLength = 0;

        // a set of nodes that have already been expanded -> algorithm should expand nodes
        // monotonically and not go back
        Set<Entity> alreadyExpanded = new HashSet<Entity>();

        // a queue holding the newly discovered nodes and their distance to the start node
        List<Entity[]> queue = new ArrayList<Entity[]>();

        // initialize queue with start node
        Entity[] innerList = new Entity[2];
        innerList[0] = pStartNode; // the node
        innerList[1] = new Entity("0"); // the distance to the start node

        // while the queue is not empty
        while (!queue.isEmpty()) {
            // remove first element from queue
            Entity[] queueElement = queue.get(0);
            Entity currentNode = queueElement[0];
            Entity distance = queueElement[1];

            // if the node was not already expanded
            if (!alreadyExpanded.contains(currentNode)) {

                // the node gets expanded now

                // if the node was a source node in a previous run, we already have added this path
                if (!pWasSource.contains(currentNode)) {
                    // add the distance of this node to shortestPathLengthSum
                    // check if maxPathLength must be updated
                    int tmpDistance = new Integer(distance.getFirstLexeme());
                    pBigShortestPathLengthSum = pBigShortestPathLengthSum.add(BigInteger.valueOf(tmpDistance));

                    if (pBigMaxPathLength.compareTo(BigInteger.valueOf(tmpDistance)) == -1) {
                        pBigMaxPathLength = BigInteger.valueOf(tmpDistance);

                        //"*TEST TRUE:* pBigShortestPathLengthSum = " +
                        // pBigShortestPathLengthSum);
                // even if the node was a source node in a previous run there can be a path to other
                // nodes over this node, so go on

                // get the neighbors of the queue element
                Set<Entity> neighbors = getNeighbors(currentNode);

                // iterate over all neighbors
                for (Entity neighbor : neighbors) {
                    // if the node was not already expanded
                    if (!alreadyExpanded.contains(neighbor)) {
                        // add the node to the queue, increase node distance by one
                        Entity[] tmpList = new Entity[2];
                        tmpList[0] = neighbor;
                        Integer tmpDistance = new Integer(distance.getFirstLexeme()) + 1;
                        tmpList[1] = new Entity(tmpDistance.toString());
            pStartNodeMaxPathLength = new Integer(distance.getFirstLexeme());
        eccentricityMap.put(pStartNode, pStartNodeMaxPathLength);

        BigInteger returnArray[] = { pBigShortestPathLengthSum, pBigMaxPathLength };
        return returnArray;

    /**
     * Computes and sets the diameter, the average degree and the average shortest path length of
     * the graph. Do not call this in the constructor. May run a while. It is called in the getters,
     * if parameters are not yet initialized when retrieved.
     */
    private void setGraphParameters() {"Setting graph parameters.");
        // The directed graph is treated as an undirected graph to compute these parameters.
        // UndirectedGraph<String, DefaultEdge> undirectedGraph = new AsUndirectedGraph<String,
        // DefaultEdge>(directedGraph);"Treating the graph as undirected.");

        // Diameter is the maximum of all shortest path lengths
        // Average shortest path length is (as the name says) the average of the shortest path
        // length between all node pairs

        BigInteger bigMaxPathLength = BigInteger.valueOf(0);
        BigInteger bigShortestPathLengthSum = BigInteger.valueOf(0);
        double degreeSum = 0.0;
        double clusterCoefficientSum = 0.0;

        // iterate over all node pairs
        Set<Entity> nodes = undirectedGraph.vertexSet();

        // a hashset of the nodes which have been the start node of the computation process
        // for such nodes all path lengths have been already computed
        Set<Entity> wasSource = new HashSet<Entity>();

        int progress = 0;
        double percent = 0.0;
        for (Entity node : nodes) {

            percent = (double) progress / nodes.size() * 100;

            if (percent % 10 == 0) {
      "Progress: " + percent);
            LoggingUtils.printProgressInfo(progress, nodes.size(), 100, LoggingUtils.ProgressInfoMode.TEXT,
                    "Getting graph parameters");

            int nodeDegree = undirectedGraph.degreeOf(node);
            degreeSum += nodeDegree;

            //"Updating degree distribution.");

            // cluster coefficient C_v of a node v is the fraction of the connections that exist
            // between the
            // neighbor nodes (k_v) of this node and all allowable connections between the neighbors
            // (k_v(k_v -1)/2)
            // for degrees 0 or 1 there is no cluster coefficient, as there can be no connections
            // between neighbors
            if (nodeDegree > 1) {
                double numberOfNeighborConnections = getNumberOfNeighborConnections(node);
                clusterCoefficientSum += (numberOfNeighborConnections / (nodeDegree * (nodeDegree - 1)));

            // Returns the new shortestPathLengthSum and the new maxPathLength.
            // They are returned as an double array for performance reasons.
            // I do not want to create an object, as this function is called *very* often
            //"Computing shortest path lengths.");
            BigInteger[] returnValues = computeShortestPathLengths(node, bigShortestPathLengthSum, bigMaxPathLength,
            bigShortestPathLengthSum = returnValues[0];
            bigMaxPathLength = returnValues[1];

            // save the info that the node was already used as the source of path computation

        if (nodes.size() > 1) {
            long denominator = nodes.size() * (nodes.size() - 1) / 2;
            this.averageShortestPathLength = bigShortestPathLengthSum.divide(BigInteger.valueOf(denominator))
            // sum of path lengths / (number of node pairs)
        } else {
            this.averageShortestPathLength = 0; // there is only one node
        this.diameter = bigMaxPathLength.doubleValue();
        this.clusterCoefficient = clusterCoefficientSum / nodes.size();

    private void updateDegreeDistribution(int nodeDegree) {
        if (degreeDistribution.containsKey(nodeDegree)) {
            //"Updating existing key in degree distribution map.");
            degreeDistribution.put(nodeDegree, (degreeDistribution.get(nodeDegree) + 1));
        } else {
            //"Creating new key in degree distribution map.");
            degreeDistribution.put(nodeDegree, 1);


    /**
     * Get the number of connections that exist between the neighbors of a node.
     *
     * @param node
     *            The node under consideration.
     * @return The number of connections that exist between the neighbors of node.
     */
    private int getNumberOfNeighborConnections(Entity node) {

        // The directed graph is treated as a undirected graph to compute these parameters.
        // UndirectedGraph<String, DefaultEdge> undirectedGraph = new AsUndirectedGraph<String,
        // DefaultEdge>(directedGraph);

        int numberOfConnections = 0;

        // get the set of neighbors
        Set<Entity> neighbors = getNeighbors(node);

        if (neighbors.size() > 0) {
            // for each pair of neighbors, test if there is a connection
            Object[] nodeArray = neighbors.toArray();
            // sort the Array so we can use a simple iteration with two for loops to access all
            // pairs

            for (int i = 0; i < neighbors.size(); i++) {
                Entity outerNode = (Entity) nodeArray[i];
                for (int j = i + 1; j < neighbors.size(); j++) {
                    Entity innerNode = (Entity) nodeArray[j];
                    // in case of a connection increase connection counter
                    // order of the nodes doesn't matter for undirected graphs

                    if (undirectedGraph.containsEdge(innerNode, outerNode)) {
                        //"There is a connection between the neighbors");

        // + " - " + numberOfConnections);

        return numberOfConnections;

    /**
     * Computes the average of the path lengths of all node pairs. The graph is treated as an
     * undirected graph. Computing graph parameters requires touching all node pairs. Therefore, if
     * one parameter is called the others are computed as well and stored for later retrieval.
     *
     * @return The average of the shortest path lengths between all pairs of nodes.
     */
    public double getAverageShortestPathLength() {

        if (averageShortestPathLength < 0) { // has not been initialized
            logger.debug("Calling setGraphParameters");
            //"Sum = " + (averageShortestPathLength * (getNumberOfNodes() *
            // (getNumberOfNodes()-1) / 2)));
        return averageShortestPathLength;


    /**
     * Computes the diameter of the graph (the maximum of the shortest path length between all pairs
     * of nodes). The graph is treated as an undirected graph. Computing graph parameters requires
     * touching all node pairs. Therefore, if one parameter is called the others are computed as
     * well and stored for later retrieval.
     *
     * @return The diameter of the graph.
     */
    public double getDiameter() {
        if (diameter < 0) { // has not been initialized
            logger.debug("Calling setGraphParameters");
        return diameter;

    // The eccentricity of the vertex v is the maximum distance from v to any vertex.
    // That is, e(v) = max{d(v,w):w in V(G)}
    public double getEccentricity(Entity node) {
        if (eccentricityMap == null) {
            logger.debug("Calling setGraphParameters");
            return eccentricityMap.get(node);
        } else if (!eccentricityMap.containsKey(node)) {
            return Double.NaN;
        } else {
            return eccentricityMap.get(node);


    // The radius of G is the minimum eccentricity among the vertices of G.
    // Therefore, radius(G) = min{e(v):v in V(G)}.
    public double getRadius() {
        double currEcc = Double.POSITIVE_INFINITY;
        double minEcc = Double.POSITIVE_INFINITY;

        for (Entity node : getNodes()) {
            currEcc = getEccentricity(node);
            if (currEcc < minEcc) {
                minEcc = currEcc;
        return minEcc;

    // The center of G is the set of vertices of eccentricity equal to the radius.
    // Hence, center(G)={v in V(G):e(v)=radius(G)}.
    public Set<Entity> getCenter() {
        Set<Entity> center = new HashSet<Entity>();
        double radius = getRadius();

        for (Entity node : getNodes()) {
            if (getEccentricity(node) == radius) {
        return center;

    /**
     * Compute the cluster coefficient of the graph (after Watts and Strogatz 1998). The cluster
     * coefficient C is defined as the average of C_v over all vertices. C_v is the fraction of the
     * connections that exist between the neighbor nodes (k_v) of a vertex v and all allowable
     * connections between the neighbors (k_v(k_v - 1)/2), i.e. C_v = 2 * (number of connections
     * between the neighbors) / (k_v * (k_v - 1)).
     *
     * @return The cluster coefficient.
     */
    public double getClusterCoefficient() {
        if (clusterCoefficient < 0) { // has not been initialized
            logger.debug("Calling setGraphParameters");
        return clusterCoefficient;

    /**
     * Computes the degree distribution. The degree of a node is the number of edges that it is
     * connected with. The graph is treated as an undirected graph. Computing graph parameters
     * requires touching all node pairs. Therefore, if one is called the others are computed as well
     * and stored for later retrieval.
     *
     * @return A map with the degree distribution of the graph.
     */
    public Map<Integer, Integer> getDegreeDistribution() {
        if (degreeDistribution == null || degreeDistribution.size() == 0) { // has not been
            // initialized
            logger.debug("Calling setGraphParameters");
        return degreeDistribution;

    /**
     * @return Returns the largest connected component as a new graph. If the base graph already is
     *         connected, it simply returns the whole graph.
     * @throws LexicalSemanticResourceException
     * @throws UnsupportedOperationException
     */
    public EntityGraphJGraphT getLargestConnectedComponent() throws LexicalSemanticResourceException {

        ConnectivityInspector<Entity, DefaultEdge> connectInspect = new ConnectivityInspector<Entity, DefaultEdge>(

        // if the graph is connected, simply return the whole graph
        if (connectInspect.isGraphConnected()) {
  "The original graph is connected. Returning this as the LCC.");
            return this;

        // else, get the largest connected component
        List<Set<Entity>> connectedComponentList = connectInspect.connectedSets(); + " connected components.");

        int i = 0;
        int maxSize = 0;
        Set<Entity> largestComponent = new HashSet<Entity>();
        for (Set<Entity> connectedComponent : connectedComponentList) {
            if (connectedComponent.size() > maxSize) {
                maxSize = connectedComponent.size();
                largestComponent = connectedComponent;

        double largestComponentRatio = largestComponent.size() * 100 / this.getNumberOfNodes();"Largest connected component contains " + largestComponentRatio + "% ("
                + largestComponent.size() + "/" + this.getNumberOfNodes() + ") of the nodes in the graph.");

        return getEntityGraphJGraphT(lexSemRes, largestComponent, "lcc", largestComponent.size());

    /**
     * Computes the depth of the graph, i.e. the maximum path length starting with the root node (if
     * a single root exists).
     *
     * @return The depth of the hierarchy.
     * @throws UnsupportedOperationException
     * @throws LexicalSemanticResourceException
     */
    private double computeDepth() throws LexicalSemanticResourceException {
        List<Entity> roots = new Stack<Entity>();
        if (roots.size() == 0) {
            logger.error("There is no root for this lexical semantic resource.");
            return Double.NaN;
        } else if (roots.size() > 1) {
            logger.warn("There are " + roots.size() + " roots for this lexical semantic resource.");
  "Trying to get root from underlying lexical semantic resource.");

            Entity root = lexSemRes.getRoot();
            if (root == null) {
                EntityGraph lcc = getLargestConnectedComponent();
                int nrOfLccNodes = lcc.getNumberOfNodes();
                int nrOfGraphNodes = this.getNumberOfNodes();

                double ratio = (double) nrOfLccNodes / (double) nrOfGraphNodes;

      "Falling back to the depth of the LCC.");

                if (ratio < 0.7) {
                    logger.warn("The largest connected component contains only " + ratio * 100
                            + "% of all nodes. Depth might not be meaningful.");

                return lcc.getDepth();
            } else {
                roots.clear(); // we know the real root, so remove the others

        Entity root = roots.get(0);
        BigInteger bigMaxPathLength = BigInteger.valueOf(0);
        BigInteger[] returnValues = computeShortestPathLengths(root, BigInteger.ZERO, bigMaxPathLength,
                new HashSet<Entity>());
        bigMaxPathLength = returnValues[1];
        return bigMaxPathLength.doubleValue();


    /**
     * This parameter is already set in the constructor as it is needed for computation of
     * relatedness values. Therefore its computation does not trigger setGraphParameters (it is too
     * slow), even if the depth is implicitly determined there, too.
     *
     * @return The depth of the graph, i.e. the maximum path length starting with the root node (if
     *         a single one exists).
     * @throws UnsupportedOperationException
     * @throws LexicalSemanticResourceException
     */
    public double getDepth() throws LexicalSemanticResourceException {
        if (depth < 0) { // has not been initialized
  "Computing depth of the hierarchy.");
            depth = computeDepth();
        return depth;

    /**
     * Gets the lowest common subsumer (LCS) of two nodes. The LCS of two nodes is the first node
     * on the path to the root that has both nodes as descendants. Nodes that are not in the same
     * connected component as the root node are defined to have no LCS.
     *
     * @param root
     *            The root entity.
     * @param e1
     *            The first entity.
     * @param e2
     *            The second entity.
     * @return The lowest common subsumer of the two nodes, or null if there is no LCS.
     * @throws UnsupportedOperationException
     */
    public Entity getLCS(Entity root, Entity e1, Entity e2) throws LexicalSemanticResourceException {

        if (e1.equals(e2)) {
            return e1;

        // List<Entity> nodeList1 = getShortestPath(root, e1, DirectionMode.undirected);
        // List<Entity> nodeList2 = getShortestPath(root, e2, DirectionMode.undirected);

        List<String> nodeList1 = getRootPathMap().get(e1.getId());
        List<String> nodeList2 = getRootPathMap().get(e2.getId());

        // if one of the paths is null => return null
        if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) {
            logger.debug("One of the node lists is null or empty!");
            return null;

        // node 1 subsumes node 2 ?
        for (String tmpNode2 : nodeList2) {
            if (tmpNode2.equals(e1)) {
                return e1;

        // node 2 subsumes node 1 ?
        for (String tmpNode1 : nodeList1) {
            if (tmpNode1.equals(e2)) {
                return e2;

        // // only needed when path is returned by getShortestPath in JGraphT implementation
        // // we need to reverse, otherwise searching starts at the wrong end
        // Collections.reverse(nodeList1);
        // Collections.reverse(nodeList2);

        // they have a lcs ?
        for (String tmpNode1 : nodeList1) {
            for (String tmpNode2 : nodeList2) {
                if (tmpNode1.equals(tmpNode2)) {
                    return this.lexSemRes.getEntityById(tmpNode1);

        logger.debug("No lcs found.");

        return null;

     * Intrinsic information content (Seco Etal. 2004) allows to compute information content from
     * the structure of the taxonomy (no corpus needed). IC(n) = 1 - log( hypo(n) + 1) /
     * log(#entities) hypo(n) is the (recursive) number of hyponyms of a node n. Recursive means
     * that the hyponyms of hyponyms are also taken into account #entities is the number of entities
     * in the graph
     * @param entity
     *            The entity node for which the intrinsic information content should be returned.
     * @return The intrinsic information content for this entity node or 0.0 if the IIC could not be
     *         computed.
     * @throws LexicalSemanticResourceException
     * @throws UnsupportedOperationException
    public double getIntrinsicInformationContent(Entity entity) throws LexicalSemanticResourceException {
        double intrinsicIC = 0.0;

        Map<String, Integer> hcm = getHyponymCountMap();

        if (!hcm.containsKey(entity.getId())) {
            return intrinsicIC;

        int hyponymCount = hcm.get(entity.getId());
        int numberOfNodes = hcm.size();

        if (hyponymCount > numberOfNodes) {
            throw new LexicalSemanticResourceException("Something is wrong with the hyponymCountMap. "
                    + hyponymCount + " hyponyms, but only " + numberOfNodes + " nodes.");

        logger.debug(entity.toString() + " has " + hyponymCount + " hyponyms.");

        if (hyponymCount >= 0) {
            intrinsicIC = (1 - (Math.log(hyponymCount + 1) / Math.log(numberOfNodes)));
        return intrinsicIC;

    /**
     * Creates the hyponym count map, which maps each node to its (recursive) number of hyponyms.
     * "Recursive" means that the hyponyms of hyponyms are also taken into account.
     *
     * @throws UnsupportedOperationException
     * @throws LexicalSemanticResourceException
     */
    private Map<String, Integer> getHyponymCountMap() throws LexicalSemanticResourceException {
        // Returns the cached hyponymCountMap if present; otherwise loads it from a serialized file
        // or computes it bottom-up over the largest connected component and serializes it.
        //
        // NOTE(review): this listing appears damaged by extraction - closing braces, the call
        // prefix of several log statements and a number of single statements are not present.
        // The notes below mark the suspicious spots; restore them from the upstream source.

        // do only create hyponymMap, if it was not already computed
        if (hyponymCountMap != null) {
            return hyponymCountMap;

        // work on the lcc, otherwise this is not going to work
        // EntityGraphJGraphT lcc = this;
        EntityGraphJGraphT lcc = this.getLargestConnectedComponent();
        int nrOfNodes = lcc.getNumberOfNodes();

        // the file name depends on the graph id and on the case sensitivity of the resource
        File hyponymCountMapSerializedFile = new File(
                getGraphId() + "_" + hyponymCountMapFilename + (lexSemRes.getIsCaseSensitive() ? "-cs" : "-cis"));
        hyponymCountMap = new HashMap<String, Integer>();

        // reuse a previously serialized map, but only if it covers every node of the lcc
        if (hyponymCountMapSerializedFile.exists()) {
            // NOTE(review): call prefix missing on the next line - presumably logger.info(
  "Loading saved hyponymyCountMap ...");
            hyponymCountMap = EntityGraphUtils.deserializeMap(hyponymCountMapSerializedFile);
            if (hyponymCountMap.size() != nrOfNodes) {
                throw new LexicalSemanticResourceException(
                        "HyponymCountMap does not contain an entry for each node in the graph."
                                + hyponymCountMap.size() + "/" + nrOfNodes);
            // NOTE(review): call prefix missing on the next line - presumably logger.info(
  "Done loading saved hyponymyCountMap");
            return hyponymCountMap;

        hyponymCountMap = new HashMap<String, Integer>();

        // a queue holding the nodes to process
        Queue<String> queue = new LinkedList<String>();

        // In the entity graph a node may have more than one father.
        // Thus, we check whether a node was already visited.
        // Then, it is not expanded again.
        Set<String> visited = new HashSet<String>();

        // initialize the queue with all leaf nodes
        Set<String> leafNodes = new HashSet<String>();
        // NOTE(review): the loop body is missing - presumably it adds leaf.getId() to leafNodes;
        // the fragment after queue.addAll(...) looks like the tail of a stripped log statement
        for (Entity leaf : lcc.getLeaves()) {
        queue.addAll(leafNodes); + " leaf nodes.");

        ProgressMeter progress = new ProgressMeter(getNumberOfNodes());
        // while the queue is not empty
        while (!queue.isEmpty()) {
            // remove first element from queue
            String currNodeId = queue.poll();
            Entity currNode = lexSemRes.getEntityById(currNodeId);

            // in some rare cases, getEntityById might fail - so better check for nulls and fail
            // gracefully
            // NOTE(review): only the put(…, 0) is visible here; presumably the node is also marked
            // as visited so that the check below skips it - confirm
            if (currNode == null) {
                hyponymCountMap.put(currNodeId, 0);


            // NOTE(review): body missing - presumably "continue" for already processed nodes
            if (visited.contains(currNodeId)) {


            // NOTE(review): nothing visible advances the progress meter - TODO confirm
            if (logger.isDebugEnabled()) {
                logger.debug(progress + " - " + queue.size() + " left in queue");
            } else if (logger.isInfoEnabled() && (progress.getCount() % 100 == 0)) {
                // NOTE(review): start of the statement missing - presumably logger.info(progress
       + " - " + queue.size() + " left in queue");

            // Sum up the counts of all children inside the lcc; a child without an entry in the
            // map makes the current node "invalid", i.e. not computable yet.
            // NOTE(review): in the visible code validChildren is never incremented and
            // invalidChildIds is never filled - the corresponding statements seem to be missing
            Set<Entity> children = lcc.getChildren(currNode);
            Set<String> invalidChildIds = new HashSet<String>();
            int validChildren = 0;
            int sumChildHyponyms = 0;
            boolean invalid = false;
            for (Entity child : children) {
                if (lcc.containsVertex(child)) {
                    if (hyponymCountMap.containsKey(child.getId())) {
                        sumChildHyponyms += hyponymCountMap.get(child.getId());
                    } else {
                        invalid = true;

            // we cannot use continue directly if invalid as this would continue the inner loop not
            // the outer loop
            if (invalid) {
                // One of the childs is not in the hyponymCountMap yet
                // Re-Enter the node into the queue and continue with next node
                // Also enter all the childs that are not in the queue yet
                // NOTE(review): the re-queueing statements and the "continue" described above are
                // not visible in this listing
                for (String childId : invalidChildIds) {
                    if (!visited.contains(childId) && !queue.contains(childId)) {

            // mark as visited
            // NOTE(review): the statement for this comment is missing - presumably
            // visited.add(currNodeId)

            // number of hyponomys of current node is the number of its own hyponyms and the sum of
            // the hyponyms of its children.
            int currNodeHyponomyCount = validChildren + sumChildHyponyms;
            hyponymCountMap.put(currNodeId, currNodeHyponomyCount);

            // add parents of current node to queue
            // NOTE(review): the statement adding the parent id to the queue is missing
            for (Entity parent : lcc.getParents(currNode)) {
                if (lcc.containsVertex(parent)) {
        } // while queue not empty + " nodes visited");
        // sanity check: every node of the lcc must have been visited
        // NOTE(review): "missed" is never filled in the visible code, and the missed entities are
        // reported via System.out instead of the logger
        if (visited.size() != nrOfNodes) {
            List<Entity> missed = new ArrayList<Entity>();
            for (Entity e : lcc.getNodes()) {
                if (!visited.contains(e.getId())) {
                    System.out.println("Missed: [" + e + "]");

            throw new LexicalSemanticResourceException(
                    "Visited only " + visited.size() + " out of " + nrOfNodes + " nodes.");
        // sanity check: the map must contain exactly one entry per node of the lcc
        if (hyponymCountMap.size() != nrOfNodes) {
            throw new LexicalSemanticResourceException(
                    "HyponymCountMap does not contain an entry for each node in the graph." + hyponymCountMap.size()
                            + "/" + nrOfNodes);

        // NOTE(review): the next four lines read like a block comment whose delimiters were lost
         * As an EntityGraph is a graph rather than a tree, the hyponymCount for top nodes can be
         * greater than the number of nodes in the graph. This is due to the multiple counting of nodes
         * having more than one parent. Thus, we have to scale hyponym counts to fall in
         * [0,NumberOfNodes].
        // counts larger than the map size are clamped to (map size - 1)
        for (String key : hyponymCountMap.keySet()) {
            if (hyponymCountMap.get(key) > hyponymCountMap.size()) {
                // TODO scaling function is not optimal (to say the least :)
                hyponymCountMap.put(key, (hyponymCountMap.size() - 1));
        // NOTE(review): the string fragments on the next two lines look like the remains of
        // stripped log statements
        }"Computed hyponymCountMap");
        EntityGraphUtils.serializeMap(hyponymCountMap, hyponymCountMapSerializedFile);"Serialized hyponymCountMap");

        return hyponymCountMap;

    /**
     * Computes the paths from each entity node to the root. Computing n paths will take some time.
     * Thus, efficient computation is based on the assumption that all subpaths of a shortest path
     * to the root are also shortest paths for the corresponding nodes. Starting with the leaf
     * nodes gives the longest initial paths with the most subpaths.
     *
     * @throws LexicalSemanticResourceException
     * @throws UnsupportedOperationException
     */
    private void createRootPathMap() throws LexicalSemanticResourceException {
        // Builds the rootPathMap field (entity id -> list of ids on the path to the root), either
        // by loading a serialized file or by computing it; also sets the depth field from it.
        //
        // NOTE(review): this listing appears damaged by extraction - closing braces, early
        // returns, the call prefix of log statements and some statements are not present.
        // In particular no call to fillRootPathMap(queue) is visible although the queue is
        // prepared twice - presumably it is invoked after each preparation step; confirm upstream.

        // do only create rootPathMap, if it was not already computed
        // NOTE(review): body missing - presumably an early "return"
        if (rootPathMap != null) {

        // the file name depends on the graph id and on the case sensitivity of the resource
        File rootPathFile = new File(this.getGraphId() + "_" + this.rootPathMapFilename
                + (lexSemRes.getIsCaseSensitive() ? "-cs" : "-cis"));

        // try to load rootPathMap from precomputed file
        // NOTE(review): the quoted fragments below look like stripped log statements; an early
        // "return" after loading is presumably missing as well, otherwise the loaded map would be
        // overwritten by the assignment that follows
        if (rootPathFile.exists()) {
  "Loading saved rootPathMap ...");
            rootPathMap = EntityGraphUtils.deserializeMap(rootPathFile);
  "Done loading saved rootPathMap");
        }"Computing rootPathMap");
        rootPathMap = new HashMap<String, List<String>>();

        // a queue holding the nodes to process
        List<Entity> queue = new ArrayList<Entity>();

        // initialize the queue with all leaf nodes
        queue.addAll(this.getLeaves()); + " leaf nodes.");

        queue.clear(); // queue should be empty now, but clear anyway

        // add non-leaf nodes that have not been on a shortest, yet
        // NOTE(review): the statement adding the entity to the queue is missing
        for (Entity entity : this.getNodes()) {
            if (!rootPathMap.containsKey(entity.getId())) {
        } + " non leaf nodes not on a shortest leaf-node to root path.");

        // report all nodes that still have no entry in the map
        for (Entity entity : this.getNodes()) {
            if (!rootPathMap.containsKey(entity.getId())) {
      "no path for " + entity.getId());
        }"Root path map contains " + rootPathMap.size() + " entries.");

        // from the root path map, we can very easily get the depth
        this.depth = getDepthFromRootPathMap();"Setting depth of entity graph: " + this.depth);"Serializing rootPathMap");
        EntityGraphUtils.serializeMap(rootPathMap, rootPathFile);

    /**
     * Processes the given queue of entities and stores a path to the root for each of them in the
     * root path map. Entities without a path get an empty list.
     * <p>
     * NOTE(review): this listing appears damaged by extraction - closing braces and several
     * statements are not present; see the notes in the body.
     *
     * @param queue
     *            The entities for which root paths should be stored.
     * @throws LexicalSemanticResourceException
     *             if the graph does not contain the root or a computed path does not start at the
     *             node and end at the root
     */
    private void fillRootPathMap(List<Entity> queue) throws LexicalSemanticResourceException {
        Entity root = lexSemRes.getRoot();
        String rootId = root.getId();

        if (!this.getGraph().containsVertex(root)) {
            throw new LexicalSemanticResourceException("Cannot fill rootPathMap if graph does not contain root.");

        // while the queue is not empty
        while (!queue.isEmpty()) {
            // remove first element from queue
            // NOTE(review): only queue.get(0) is visible; the removal itself (presumably
            // queue.remove(0)) is missing - without it the loop would not terminate
            Entity currentNode = queue.get(0);
            String currentNodeId = currentNode.getId();

            logger.debug("Queue size: " + queue.size());

            // if we have already insert a path for this node => continue with the next
            // NOTE(review): body missing - presumably "continue"
            if (getRootPathMap().containsKey(currentNodeId)) {

            // compute path from current node to root
            List<String> nodesOnPath = getPathToRoot(root, currentNode);

            // if there is no path => skip
            // NOTE(review): a "continue" after the put is presumably missing; otherwise the
            // nodesOnPath.get(0) below would fail for a null or empty path
            if (nodesOnPath == null || nodesOnPath.size() == 0) {
                getRootPathMap().put(currentNodeId, new ArrayList<String>());

            // the first entry should be the current Node, the last entry should be the root
            // check whether this assumption is valid
            if (!nodesOnPath.get(0).equals(currentNodeId) || // the first node of the list should
            // always be the current node
                    !nodesOnPath.get(nodesOnPath.size() - 1).equals(rootId)) { // the last node of
                // the list should
                // always be the
                // root node
                logger.error("Something is wrong with the path to the root");
                logger.error(nodesOnPath.get(0) + " -- " + currentNodeId);
                logger.error(nodesOnPath.get(nodesOnPath.size() - 1) + " -- " + rootId);
                throw new LexicalSemanticResourceException("Something is wrong with a path to the root");

            // store, for every node on the path, the remaining sub-path from that node to the root
            // NOTE(review): the start of the put(...) call and the increment of i are not visible;
            // make sure i advances for skipped nodes as well, or the sub-paths will be misaligned
            int i = 0;
            for (String nodeOnPath : nodesOnPath) {
                // if we have already insert a path for this node => continue with the next
                if (getRootPathMap().containsKey(nodeOnPath)) {
                // insert path
                else {
                            new ArrayList<String>(nodesOnPath.subList(i, nodesOnPath.size())));
        } // while queue not empty

     * This parameter is already set in the constructor as it is needed for computation of
     * relatedness values. Therefore its computation does not trigger setGraphParameters (it is too
     * slow), even if the depth is implicitly determined there, too.
     * @return The depth of the category graph, i.e. the maximum path length starting with the root
     *         node.
     * @throws LexicalSemanticResourceException
     * @throws UnsupportedOperationException
    private double getDepthFromRootPathMap() throws LexicalSemanticResourceException {
        int max = 0;
        for (List<String> path : getRootPathMap().values()) {
            if (path.size() > max) {
                max = path.size();

        max = max - 1; // depth is measured in nodes, not edges

        if (max < 0) {
            return 0;
        } else {
            return max;

     * Returns the shortest path from node to root as a list of pageIds of the nodes on the path.
     * Node and root are included in the path node list.
     * @param root
     *            The root node of the graph.
     * @param node
     *            A node of the graph.
     * @return The shortest path from node to root as a list of pagIs of the nodes on the path; or
     *         null if no path exists
     * @throws LexicalSemanticResourceException
    private List<String> getPathToRoot(Entity root, Entity node) throws LexicalSemanticResourceException {
        List<String> pathToRoot = new LinkedList<String>();
        List<String> shortestPath = new ArrayList<String>();

        expandPath(root, node, pathToRoot, shortestPath, 0);

        if (shortestPath.size() == 0) {
            return null;
        } else {
            return shortestPath;

    /**
     * Recursively follows the incoming edges of {@code currentNode} in search of {@code root},
     * up to {@code MAX_EXPANSION_DEPTH} recursion levels.
     * <p>
     * NOTE(review): this listing appears damaged by extraction - closing braces, early returns and
     * all statements that modify {@code currentPath} and {@code shortestPath} are not present.
     * Judging from the debug messages, {@code shortestPath} is presumably replaced by
     * {@code currentPath} whenever the root is reached on a shorter path; confirm upstream.
     *
     * @param root
     *            The root node that terminates a path.
     * @param currentNode
     *            The node to expand.
     * @param currentPath
     *            The ids collected on the current path.
     * @param shortestPath
     *            The shortest path found so far; empty while none has been found.
     * @param expansionDepth
     *            The current recursion depth.
     * @throws LexicalSemanticResourceException
     *             if an incoming edge has the current node itself as its source
     */
    private void expandPath(Entity root, Entity currentNode, List<String> currentPath, List<String> shortestPath,
            int expansionDepth) throws LexicalSemanticResourceException {

        // NOTE(review): body missing - presumably "return" once the depth limit is exceeded
        if (expansionDepth > MAX_EXPANSION_DEPTH) {

        // add the current node to the path
        // NOTE(review): the statement for this comment is missing - presumably
        // currentPath.add(currentNode.getId())

        // if root node reached, check whether it is a shortest path
        if (currentNode.getId().equals(root.getId())) {
            logger.debug("found root");

            if (shortestPath.size() != 0) {
                if (currentPath.size() < shortestPath.size()) {
                    logger.debug("setting new shortest path");
            } else {
                logger.debug("initializing shortest path");

        // do not expand paths that are longer or equal than the current shortest path
        // this is a runtime efficiency optimization!
        // NOTE(review): body missing - presumably "return"
        if (shortestPath.size() != 0 && currentPath.size() >= shortestPath.size()) {

        Set<DefaultEdge> incomingEdges = this.getGraph().incomingEdgesOf(currentNode);

        // no incoming edges => return path without adding this node
        if (incomingEdges == null || incomingEdges.size() == 0) {
            logger.debug("found non-root source");

        for (DefaultEdge incomingEdge : incomingEdges) {
            Entity sourceNode = this.getGraph().getEdgeSource(incomingEdge);

            // NOTE(review): this is an identity comparison (==), not equals() - confirm that
            // detecting only the very same object is intended
            if (sourceNode == currentNode) {
                logger.warn("Source node equals current node. Fatal error.");
                throw new LexicalSemanticResourceException("Source node equals current node. Fatal error.");
            // snapshot of the path before descending
            // NOTE(review): savedPath is not used in the visible code - presumably currentPath is
            // reset to it after the recursive call. Also note that ++expansionDepth modifies the
            // local variable, so each further sibling is expanded with a larger depth than the
            // previous one; "expansionDepth + 1" would keep siblings on the same depth.
            List<String> savedPath = new LinkedList<String>(currentPath);
            expandPath(root, sourceNode, currentPath, shortestPath, ++expansionDepth);


    protected Map<String, List<String>> getRootPathMap() throws LexicalSemanticResourceException {
        if (rootPathMap == null) {
        return this.rootPathMap;

    public List<String> getPageRank() {
        throw new UnsupportedOperationException();

    public List<String> getHITS() {
        throw new UnsupportedOperationException();

    public boolean containsCycles() throws LexicalSemanticResourceException {
        boolean hasCycles = cycleHandler.containsCycle();"The graph contains cycles: " + hasCycles);
        return hasCycles;

    /**
     * If the cycle handler reports a cycle, saves the directed graph to the serialized graph file.
     * <p>
     * NOTE(review): this listing appears damaged by extraction - closing braces are missing and no
     * statement that actually removes the cycles is visible. Presumably the cycle handler is asked
     * to remove them before the graph is saved; confirm upstream.
     *
     * @throws LexicalSemanticResourceException
     *             if saving the graph fails with an IOException
     */
    public void removeCycles() throws LexicalSemanticResourceException {
        if (cycleHandler.containsCycle()) {
            // NOTE(review): call prefix missing on the next line - presumably logger.info(
  "Overwriting serialized graph file with cycle-free version.");
            try {
                GraphSerialization.saveGraph(directedGraph, serializedGraphFile);
            } catch (IOException e) {
                // wrap the I/O failure, keeping the original exception as the cause
                throw new LexicalSemanticResourceException(e);


    public String getGraphId() {
        return this.graphId;

    public Set<Set<Entity>> getStructuralEquivalences() {
        throw new UnsupportedOperationException();

    public void setHyponymCountMapUseLcc(boolean aHyponymCountMapUseLcc) {
        hyponymCountMapUseLcc = aHyponymCountMapUseLcc;

    public boolean isHyponymCountMapUseLcc() {
        return hyponymCountMapUseLcc;