edu.stanford.muse.graph.directed.Digraph.java Source code

Introduction

Here is the source code for edu.stanford.muse.graph.directed.Digraph.java
Source

/*
 Copyright (C) 2012 The Stanford MobiSocial Laboratory
    
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    
   http://www.apache.org/licenses/LICENSE-2.0
    
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
package edu.stanford.muse.graph.directed;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import edu.stanford.muse.util.Pair;
import edu.stanford.muse.util.Triple;
import edu.stanford.muse.util.UnionFindSet;
import edu.stanford.muse.util.Util;

public class Digraph<T> {
    /** Commons-logging logger shared by all Digraph instances. */
    public static Log log = LogFactory.getLog(Digraph.class);

    /** Maps each payload to the node that wraps it; the constructor installs a LinkedHashMap here. */
    public Map<T, DigraphNode<T>> allNodes;
    /** Edge counter; addEdge increments it whenever a call creates a new successor entry. */
    public int nEdges;

    /** Output stream for this graph; starts as System.out and can be repointed by setOut. */
    public PrintStream out = System.out;

    /**
     * Points this graph's output stream at a new PrintStream opened on the named file.
     *
     * @param file path of the file to write to
     * @throws FileNotFoundException if the file cannot be opened or created for writing
     */
    private void setOut(String file) throws FileNotFoundException {
        PrintStream fileStream = new PrintStream(file);
        this.out = fileStream;
    }

    /** Creates an empty graph; nodes are kept in the order in which they are first added. */
    public Digraph() {
        this.allNodes = new LinkedHashMap<T, DigraphNode<T>>();
    }

    /**
     * Returns the node registered for payload t, creating and registering a new node first
     * when the graph does not contain one yet.
     *
     * @param t the payload to look up or insert
     * @return the existing or newly created node for t
     */
    public DigraphNode<T> add(T t) {
        DigraphNode<T> node = allNodes.get(t);
        if (node == null) {
            node = new DigraphNode<T>(t);
            allNodes.put(t, node);
        }
        return node;
    }

    /**
     * Checks the consistency of the graph data structure: every key of the node map must equal
     * the payload of the node it maps to, and every node must pass its own verify() check.
     */
    public void verify() {
        for (Map.Entry<T, DigraphNode<T>> entry : allNodes.entrySet()) {
            DigraphNode<T> node = entry.getValue();
            Util.ASSERT(entry.getKey().equals(node.payload));
            node.verify();
        }
    }

    /**
     * Returns a header line stating the node count, followed by the string form of each node
     * on its own line, in node-map order.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("Graph with ").append(allNodes.size()).append(" nodes\n");
        for (DigraphNode<T> node : allNodes.values()) {
            text.append(node).append('\n');
        }
        return text.toString();
    }

    /**
     * Groups the nodes into components by unifying every node with each of its successors in a
     * union-find structure.
     *
     * NOTE(review): only nodes that take part in at least one unify call are handed to the
     * union-find here; whether edge-less nodes appear in the result depends on UnionFindSet -
     * confirm.
     *
     * @return the classes reported by UnionFindSet.getClassesSortedByClassSize()
     */
    public List<List<DigraphNode<T>>> findComponentSizes() {
        UnionFindSet<DigraphNode<T>> components = new UnionFindSet<DigraphNode<T>>();
        for (DigraphNode<T> source : allNodes.values()) {
            for (DigraphNode<T> target : source.succNodes.keySet()) {
                components.unify(source, target);
            }
        }
        return components.getClassesSortedByClassSize();
    }

    /**
     * Builds a histogram of component sizes.
     *
     * @return map from component size to the number of components of that size, keyed in the
     *         order in which sizes are first encountered in findComponentSizes()
     */
    public Map<Integer, Integer> componentSizeScatterPlot() {
        Map<Integer, Integer> countBySize = new LinkedHashMap<Integer, Integer>();
        for (List<DigraphNode<T>> component : findComponentSizes()) {
            Integer sizeKey = component.size();
            Integer seen = countBySize.get(sizeKey);
            countBySize.put(sizeKey, (seen == null) ? 1 : seen + 1);
        }
        return countBySize;
    }

    /**
     * Computes the outgoing-degree distribution of the graph.
     *
     * @return (out-degree, number of nodes with that out-degree) pairs, ordered by
     *         Util.sortPairsByFirstElement
     */
    @SuppressWarnings("unchecked")
    public List<Pair<Integer, Integer>> getDegreeDistribution() {
        Map<Integer, Integer> nodesPerDegree = new LinkedHashMap<Integer, Integer>();
        for (DigraphNode<T> node : allNodes.values()) {
            Integer outDegree = node.succNodes.size();
            Integer soFar = nodesPerDegree.get(outDegree);
            nodesPerDegree.put(outDegree, (soFar == null) ? 1 : soFar + 1);
        }

        List<Pair<Integer, Integer>> distribution = new ArrayList<Pair<Integer, Integer>>();
        for (Map.Entry<Integer, Integer> entry : nodesPerDegree.entrySet()) {
            Pair<Integer, Integer> point = new Pair<Integer, Integer>(entry.getKey(), entry.getValue());
            distribution.add(point);
        }
        Util.sortPairsByFirstElement((List) distribution);
        return distribution;
    }

    /**
     * Computes the average of the per-node clustering coefficients over all nodes of the graph.
     *
     * @return the mean coefficient, or 0.0 for a graph without nodes
     */
    public double clusteringCoeff() {
        if (allNodes.isEmpty())
            return 0.0; // 0.0 / 0 would evaluate to NaN

        double sum = 0.0;
        for (DigraphNode<T> n : allNodes.values())
            sum += clusteringCoeff(n);
        return sum / allNodes.size();
    }

    /**
     * Computes the clustering coefficient of a single node: the number of ordered pairs
     * (succ1, succ2) of n's successors for which succ2 is a successor of succ1, divided by
     * size * (size - 1), where size is the number of successors of n.
     *
     * The graph is expected to be undirected, i.e. every edge is stored in both directions;
     * this is asserted for each connected pair found.
     *
     * @param n the node to examine
     * @return the coefficient, or 0.0 when n has at most one successor
     */
    public static double clusteringCoeff(DigraphNode<?> n) {
        int size = n.succNodes.size();
        if (size <= 1)
            return 0.0; // no pair of distinct successors; also skips the quadratic scan below

        // long counter and double denominator: in int arithmetic both overflow once
        // size * (size - 1) exceeds Integer.MAX_VALUE
        long triangles = 0;
        for (DigraphNode<?> succ1 : n.succNodes.keySet())
            for (DigraphNode<?> succ2 : n.succNodes.keySet())
                if (succ1.succNodes.containsKey(succ2)) {
                    Util.ASSERT(succ2.succNodes.containsKey(succ1)); // because undirected graph
                    triangles++;
                }

        return ((double) triangles) / ((double) size * (size - 1));
    }

    /**
     * Turns every map entry into a (key, value) pair and orders the pairs with
     * Util.sortPairsByFirstElement.
     */
    @SuppressWarnings({ "unchecked", "unused" })
    private static List<Pair<Integer, Integer>> convertMapToPairs(Map<Integer, Integer> map) {
        List<Pair<Integer, Integer>> pairs = new ArrayList<Pair<Integer, Integer>>();
        for (Map.Entry<Integer, Integer> entry : map.entrySet()) {
            Integer first = entry.getKey();
            Integer second = entry.getValue();
            pairs.add(new Pair<Integer, Integer>(first, second));
        }
        Util.sortPairsByFirstElement((List) pairs);
        return pairs;
    }

    /**
     * Summarises the graph as "Graph with N nodes and M edges". M is half the total number of
     * successor entries, i.e. every edge is taken to be stored in both directions.
     */
    public String stats() {
        long successorEntries = 0;
        for (DigraphNode<?> node : allNodes.values()) {
            successorEntries += node.succNodes.size();
        }
        long edgeCount = successorEntries / 2;
        return "Graph with " + allNodes.size() + " nodes and " + edgeCount + " edges";
    }

    /**
     * Removes every edge by clearing each node's successor map and resets the edge counter.
     * The nodes themselves stay in the graph.
     */
    public void clearAllEdges() {
        for (DigraphNode<?> n : allNodes.values()) {
            n.succNodes.clear();
        }
        // keep the counter in step with the now-empty edge set; otherwise later addEdge calls
        // would continue counting from the stale total
        nEdges = 0;
    }

    /**
     * Returns the number of nodes in the largest component reported by findComponentSizes().
     *
     * @return the largest component size, or 0 when there are no components at all (rather
     *         than exposing an Integer.MIN_VALUE sentinel to callers)
     */
    public int getLargestComponentSize() {
        int maxSize = 0;
        for (List<DigraphNode<T>> component : findComponentSizes())
            maxSize = Math.max(maxSize, component.size());
        return maxSize;
    }

    /**
     * For every edge (n, n1) with n1.id greater than n.id, counts how many of n's other
     * successors are also successors of n1, and returns the resulting (n, n1, count) triples
     * ordered by Util.sortTriplesByThirdElement.
     */
    @SuppressWarnings("unchecked")
    public List<Triple<DigraphNode<?>, DigraphNode<?>, Integer>> sortedEdgesByCommonCoauthors() {
        List<Triple<DigraphNode<?>, DigraphNode<?>, Integer>> weightedEdges = new ArrayList<Triple<DigraphNode<?>, DigraphNode<?>, Integer>>();

        for (DigraphNode<?> source : allNodes.values()) {
            for (DigraphNode<?> target : source.succNodes.keySet()) {
                // handle each pair from the lower-id endpoint only
                if (target.id <= source.id)
                    continue;

                int common = 0;
                for (DigraphNode<?> other : source.succNodes.keySet()) {
                    if (other != target && target.succNodes.containsKey(other))
                        common++;
                }
                weightedEdges.add(new Triple<DigraphNode<?>, DigraphNode<?>, Integer>(source, target, common));
            }
        }

        Util.sortTriplesByThirdElement((List) weightedEdges);
        return weightedEdges;
    }

    /**
     * Serialises the graph as text: a "nodedef>" section with one "index,label" line per node
     * (commas inside labels are replaced by spaces), followed by an "edgedef>" section with one
     * "src,dest,weight" line per successor entry. Node indices are assigned in node-map order,
     * starting at 0.
     * NOTE(review): the section headers look like the GDF graph format - confirm.
     */
    public String dump() {
        Map<T, Integer> indexOf = new LinkedHashMap<T, Integer>();
        StringBuilder text = new StringBuilder();

        text.append("nodedef>name INTEGER, label VARCHAR\n");
        int next = 0;
        for (T payload : allNodes.keySet()) {
            indexOf.put(payload, next);
            String label = payload.toString().replace(',', ' ');
            text.append(next).append(',').append(label).append('\n');
            next++;
        }

        text.append("edgedef>src INTEGER, dest INTEGER, weight double\n");
        for (Map.Entry<T, DigraphNode<T>> nodeEntry : allNodes.entrySet()) {
            Map<DigraphNode<T>, Float> successors = nodeEntry.getValue().succNodes;
            if (successors == null)
                continue;

            Integer fromIndex = indexOf.get(nodeEntry.getKey());
            for (Map.Entry<DigraphNode<T>, Float> edge : successors.entrySet()) {
                float weight = edge.getValue();
                Integer toIndex = indexOf.get(edge.getKey().payload);
                text.append(fromIndex).append(',').append(toIndex).append(',').append(weight).append('\n');
            }
        }
        return text.toString();
    }

    /**
     * Adds the directed edge t1 -> t2 with the given weight, creating either endpoint node when
     * it is missing; an existing edge just has its weight replaced. nEdges is incremented only
     * when t2 was not yet a successor of t1, and a progress line is printed to System.out at
     * every 1000th new edge.
     *
     * @param t1     payload of the source node
     * @param t2     payload of the target node
     * @param weight weight stored on the edge
     */
    public void addEdge(T t1, T t2, float weight) {
        DigraphNode<T> source = add(t1);
        DigraphNode<T> target = add(t2);

        int sizeBefore = source.succNodes.size();
        source.succNodes.put(target, weight);
        boolean isNewEdge = source.succNodes.size() > sizeBefore;
        if (!isNewEdge)
            return;

        nEdges++;
        if (nEdges % 1000 == 0)
            System.out.println("#edges = " + nEdges);
    }

    /**
     * Builds a term co-occurrence graph from the given documents and writes its dump() text to
     * outfile.
     *
     * All names are lower-cased. Within a paragraph, every ordered pair of distinct terms adds 1
     * to the weight of the edge from the first term to the second, so each co-occurrence is
     * recorded in both directions. Paragraphs with more than 100 names are skipped with a
     * warning. Terms that never co-occur with a different term are still added to the graph as
     * nodes without edges.
     *
     * @param docMap  doc id -> paragraphs of that doc, each paragraph being a collection of names
     * @param outfile path of the file that receives the graph dump
     * @throws FileNotFoundException if outfile cannot be opened for writing
     */
    public static void doIt(Map<Integer, Collection<Collection<String>>> docMap, String outfile)
            throws FileNotFoundException {
        // longer paragraphs are skipped because pairing the names below is quadratic
        final int maxParaNames = 100;

        // index stores for each term, count of how many times it co-occurs with another in a doc.
        Map<String, Map<String, Integer>> index = new LinkedHashMap<String, Map<String, Integer>>();
        // occurrences per term; its key set is the complete list of terms seen
        Map<String, Integer> termFreq = new LinkedHashMap<String, Integer>();

        // compute index
        for (Map.Entry<Integer, Collection<Collection<String>>> doc : docMap.entrySet()) {
            Integer num = doc.getKey();

            for (Collection<String> paraNames : doc.getValue()) {
                System.out.println(num + ". " + paraNames.size() + " names " + " prev index size " + index.size()
                        + " term freq size " + termFreq.size());
                if (paraNames.size() > maxParaNames) {
                    log.warn("skipping long para" + paraNames);
                    continue;
                }

                // normalise case once per paragraph so that both sides of every pair use the
                // same form (comparing a lower-cased term with a raw one produced self-edges
                // and mixed-case target nodes)
                List<String> terms = new ArrayList<String>(paraNames.size());
                for (String name : paraNames)
                    terms.add(name.toLowerCase());

                for (String s : terms) {
                    // bump up term freq for this term
                    Integer X = termFreq.get(s);
                    termFreq.put(s, (X == null) ? 1 : X + 1);

                    // bump up counts for co-occurring terms...
                    // unfortunately n^2 operation here
                    Map<String, Integer> termMap = index.get(s);
                    for (String s1 : terms) {
                        // compare by value: == on strings only matches identical references
                        if (s.equals(s1))
                            continue;

                        if (termMap == null) {
                            // allocate termMap the first time s co-occurs with another term
                            termMap = new LinkedHashMap<String, Integer>(1);
                            index.put(s, termMap);
                        }

                        // bump the count
                        Integer I = termMap.get(s1);
                        termMap.put(s1, (I == null) ? 1 : I + 1);
                    }
                }
            }
        }

        // process index and store it as a graph structure

        Digraph<String> graph = new Digraph<String>();
        // walk all terms (not just index keys) so that terms without any edge are reachable here
        for (String term : termFreq.keySet()) {
            Map<String, Integer> map = index.get(term);
            if (map == null) {
                // no edges, just add it to the graph and continue
                graph.add(term);
                continue;
            }

            // edge weight is the raw co-occurrence count
            for (Map.Entry<String, Integer> e : map.entrySet())
                graph.addEdge(term, e.getKey(), e.getValue().floatValue());
        }

        String s = graph.dump();
        PrintStream pw = new PrintStream(new FileOutputStream(outfile));
        try {
            pw.print(s);
            // PrintStream swallows IOExceptions; checkError() flushes and reports them
            if (pw.checkError())
                log.warn("error while writing graph to " + outfile);
        } finally {
            pw.close();
        }
    }

}