de.tudarmstadt.ukp.wikipedia.api.WikipediaInfo.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.wikipedia.api.WikipediaInfo.java

Source

/*******************************************************************************
 * Copyright (c) 2010 Torsten Zesch.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * Contributors:
 *     Torsten Zesch - initial API and implementation
 ******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.api;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hibernate.Session;

import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiPageNotFoundException;
import de.tudarmstadt.ukp.wikipedia.util.ApiUtilities;

/** Holds various statistics on a given subset of Wikipedia pages
 * (the subset may also be the whole Wikipedia).
 * @author zesch
 */
public class WikipediaInfo {

    private final Log logger = LogFactory.getLog(getClass());

    private Iterable<Page> pages;
    private double averageFanOut;

    private int numberOfPages;

    private Map<Integer, Integer> degreeDistribution;
    private Set<Integer> categorizedArticleSet;

    private Wikipedia wiki;

    /**
     * Get info for the whole Wikipedia.
     * @param pWiki The wiki object.
     */
    public WikipediaInfo(Wikipedia pWiki) throws WikiApiException {
        // delegate to the page set constructor so that this object is actually initialized
        this(pWiki.getPages());
        this.wiki = pWiki;
    }

    /**
     * Get info only for a subset of articles.
     * @param pPages A set of pages. Only this subset of wiki pages is used in the info object.
     */
    public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException {
        if (pPages == null) {
            throw new WikiApiException("The page set has to be initialized.");
        }

        pages = pPages;
        averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is accessed

        degreeDistribution = new HashMap<Integer, Integer>();
        categorizedArticleSet = new HashSet<Integer>();

        // count the number of pages
        numberOfPages = 0;
        Iterator<Page> pageIter = pages.iterator();
        while (pageIter.hasNext()) {
            numberOfPages++;
            pageIter.next();
        }

    }

    /** Computes the average fan-out of the page set.
     * Fan-out is the number of outgoing links per page.
     * @param pages The pages of the subset.
     * @return The average fan-out.
     */
    private double computeAverageFanOut(Iterable<Page> pages) {

        Set<Integer> pageIDs = new HashSet<Integer>();
        Iterator<Page> pageIter = pages.iterator();
        while (pageIter.hasNext()) {
            pageIDs.add(pageIter.next().getPageId());
        }

        if (pageIDs.isEmpty()) {
            logger.error("Cannot compute average fan-out of an empty page set.");
            return 0.0;
        }

        int fanOutCounter = 0;

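        // fetch (outLinks, pageId) pairs for all pages in a single HQL query;
        // pages outside the given subset are filtered out in memory below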
        Session session = this.wiki.__getHibernateSession();
        session.beginTransaction();
        Iterator results = session.createQuery("select page.outLinks, page.pageId from Page as page").list()
                .iterator();
        while (results.hasNext()) {
            Object[] row = (Object[]) results.next();
            Set outLinks = (Set) row[0];
            Integer pageId = (Integer) row[1];

            // if the current page ID is in the desired result set => add outlink value
            if (pageIDs.contains(pageId)) {
                fanOutCounter += outLinks.size();
            }

        }
        session.getTransaction().commit();

        return (double) fanOutCounter / this.getNumberOfPages();
    }

    /**
     * @return Returns the averageFanOut.
     */
    public double getAverageFanOut() {
        if (averageFanOut < 0) { // not yet initialized
            averageFanOut = computeAverageFanOut(this.pages);
        }

        return averageFanOut;
    }

    /**
     * @return Returns the numberOfPages.
     */
    public int getNumberOfPages() {
        return numberOfPages;
    }

    /**
     * Builds a mapping from categories to article sets.
     * @param pWiki The wikipedia object.
     * @param pNodes The category nodes that should be used to build the map.
     * @return A mapping from categories to article sets.
     * @throws WikiPageNotFoundException
     */
    private Map<Integer, Set<Integer>> getCategoryArticleMap(Wikipedia pWiki, Set<Integer> pNodes)
            throws WikiPageNotFoundException {
        Map<Integer, Set<Integer>> categoryArticleMap = new HashMap<Integer, Set<Integer>>();

        int progress = 0;
        for (int node : pNodes) {
            progress++;
            ApiUtilities.printProgressInfo(progress, pNodes.size(), 10, ApiUtilities.ProgressInfoMode.TEXT,
                    "Getting category-article map.");

            Category cat = pWiki.getCategory(node);
            if (cat != null) {
                Set<Integer> pages = new HashSet<Integer>(cat.__getPages());
                categoryArticleMap.put(node, pages);
            } else {
                logger.info(node + " is not a category.");
            }
        }

        return categoryArticleMap;
    }

    /**
     * Gets various graph parameters of the category graph, such as diameter and average out-degree, and logs them.
     * @param catGraph The category graph.
     */
    public void getGraphParameters(CategoryGraph catGraph) {
        double startTime = System.currentTimeMillis();
        logger.error(catGraph.getGraphInfo());
        double endTime = (System.currentTimeMillis() - startTime) / 1000.0;
        logger.error(endTime + "s");
    }

    /**
     * Articles in wikipedia may be tagged with multiple categories.
     * It may be interesting to know how many articles have at least one category in common.
     * Such articles would get a very high semantic relatedness even if the only category they share is a rather peripheral one.
     * @param pWiki The wikipedia object.
     * @param catGraph The category graph.
     * @throws WikiApiException
     */
    public void getOverlapping(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException {
        double startTime = System.currentTimeMillis();

        int articlesWithOverlappingCategories = getArticlesWithOverlappingCategories(pWiki, catGraph);
        double overlappingCategoriesRatio = (double) articlesWithOverlappingCategories
                / (double) pWiki.getMetaData().getNumberOfPages();
        logger.error(articlesWithOverlappingCategories + " - " + pWiki.getMetaData().getNumberOfPages() + " - "
                + overlappingCategoriesRatio);

        double endTime = (System.currentTimeMillis() - startTime) / 1000.0;
        logger.error(endTime + "s");
    }

    /**
     * Articles in wikipedia may be tagged with multiple categories.
     * It may be interesting to know how many articles have at least one category in common.
     * Such articles would get a very high semantic relatedness even if the only category they share is a rather peripheral one.
     * @param pWiki The wikipedia object.
     * @param pGraph The category graph.
     * @return The number of articles that have at least one category in common.
     * @throws WikiPageNotFoundException
     */
    private int getArticlesWithOverlappingCategories(Wikipedia pWiki, CategoryGraph pGraph)
            throws WikiPageNotFoundException {
        Set<Integer> overlappingArticles = new HashSet<Integer>();

        // iterate over all node pairs
        Set<Integer> nodes = pGraph.getGraph().vertexSet();

        Map<Integer, Set<Integer>> categoryArticleMap = getCategoryArticleMap(pWiki, nodes);

        // sort the Array so we can use a simple iteration with two for loops to access all pairs
        Object[] nodeArray = nodes.toArray();
        Arrays.sort(nodeArray);

        int progress = 0;
        for (int i = 0; i < nodes.size(); i++) {
            progress++;
            ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "");

            int outerNode = (Integer) nodeArray[i];

            for (int j = i + 1; j < nodes.size(); j++) {
                int innerNode = (Integer) nodeArray[j];

                // test whether the categories have pages in common
                Set<Integer> outerPages = categoryArticleMap.get(outerNode);
                Set<Integer> innerPages = categoryArticleMap.get(innerNode);

                for (int outerPage : outerPages) {
                    if (innerPages.contains(outerPage)) {
                        overlappingArticles.add(outerPage);
                    }
                }

            }
        }

        return overlappingArticles.size();
    }

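    /**
     * Computes and logs the number and the ratio of articles that have at least one category.
     * @param pWiki The wikipedia object.
     * @param catGraph The category graph.
     * @throws WikiApiException
     */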
    public void getCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException {
        double startTime = System.currentTimeMillis();

        int numberOfCategorizedArticles = getNumberOfCategorizedArticles(pWiki, catGraph);
        double categorizedArticlesRatio = (double) numberOfCategorizedArticles
                / (double) pWiki.getMetaData().getNumberOfPages();

        logger.info("Categorized articles: " + numberOfCategorizedArticles);
        logger.info("All articles:         " + pWiki.getMetaData().getNumberOfPages());
        logger.info("Ratio:                " + categorizedArticlesRatio);

        double endTime = (System.currentTimeMillis() - startTime) / 1000.0;
        logger.error(endTime + "s");
    }

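    /**
     * Computes the average shortest path length from the root category to all other categories in the connected category graph.
     * @param pWiki The wikipedia object.
     * @param connectedCatGraph The connected category graph containing the root category.
     * @return The average path length from the root category to the other categories.
     * @throws WikiApiException
     */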
    public double getAveragePathLengthFromRoot(Wikipedia pWiki, CategoryGraph connectedCatGraph)
            throws WikiApiException {
        // get root node
        Category rootCategory = pWiki.getMetaData().getMainCategory();
        int root = rootCategory.getPageId();

        int pathLengthSum = computeShortestPathLengths(root, connectedCatGraph);

        return (double) pathLengthSum / (connectedCatGraph.getGraph().vertexSet().size() - 1);
    }

    /**
     * If the return value has already been computed, it is returned; otherwise it is computed at retrieval time.
     * @param pWiki The wikipedia object.
     * @param catGraph The category graph.
     * @return The number of categorized articles, i.e. articles that have at least one category.
     */
    public int getNumberOfCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException {
        if (categorizedArticleSet.isEmpty()) { // has not been computed yet
            iterateCategoriesGetArticles(pWiki, catGraph);
        }
        return categorizedArticleSet.size();
    }

    /**
     * Computes the distribution of the number of articles per category.
     * If the return value has already been computed, it is returned; otherwise it is computed at retrieval time.
     * @param pWiki The wikipedia object.
     * @param catGraph The category graph.
     * @return A map from the number of articles in a category to the number of categories in the graph that contain exactly that many articles.
     * @throws WikiPageNotFoundException
     */
    public Map<Integer, Integer> getDistributionOfArticlesByCategory(Wikipedia pWiki, CategoryGraph catGraph)
            throws WikiPageNotFoundException {
        if (degreeDistribution.isEmpty()) { // has not been computed yet
            iterateCategoriesGetArticles(pWiki, catGraph);
        }
        return degreeDistribution;
    }

    /**
     * Computations that have to iterate over all categories and access their articles can plug in here.
     * Currently plugged in:
     *      numberOfCategorizedArticles
     *      distributionOfArticlesByCategory
     * @param pWiki The wikipedia object.
     * @param catGraph The category graph.
     * @throws WikiPageNotFoundException
     */
    private void iterateCategoriesGetArticles(Wikipedia pWiki, CategoryGraph catGraph)
            throws WikiPageNotFoundException {
        Map<Integer, Integer> localDegreeDistribution = new HashMap<Integer, Integer>();
        Set<Integer> localCategorizedArticleSet = new HashSet<Integer>();
        Set<Integer> categoryNodes = catGraph.getGraph().vertexSet();
        // iterate over all categories
        int progress = 0;
        for (int node : categoryNodes) {
            progress++;
            ApiUtilities.printProgressInfo(progress, categoryNodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT,
                    "iterate over categories");

            // get the category
            Category cat = pWiki.getCategory(node);
            if (cat != null) {
                Set<Integer> pages = new HashSet<Integer>(cat.__getPages());

                // update degree distribution map
                int numberOfArticles = pages.size();
                if (localDegreeDistribution.containsKey(numberOfArticles)) {
                    int count = localDegreeDistribution.get(numberOfArticles);
                    count++;
                    localDegreeDistribution.put(numberOfArticles, count);
                } else {
                    localDegreeDistribution.put(numberOfArticles, 1);
                }

                // add each page to the categorized articles set, if it is not already in it
                for (int page : pages) {
                    localCategorizedArticleSet.add(page);
                }
            } else {
                logger.info(node + " is not a category.");
            }
        }
        this.degreeDistribution = localDegreeDistribution;
        this.categorizedArticleSet = localCategorizedArticleSet;
    }

    /**
     * Computes the sum of the shortest path lengths from the given start node to all other nodes.
     * As the JGraphT BreadthFirstIterator does not provide information about
     *   the distance to the start node in each step, we use our own BFS implementation.
     * @param pStartNode The start node of the search.
     * @param catGraph The category graph.
     * @return The sum of the shortest path lengths from the start node to all reachable nodes.
     */
    private int computeShortestPathLengths(int pStartNode, CategoryGraph catGraph) {
        int shortestPathLengthSum = 0;

        // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back
        Set<Integer> alreadyExpanded = new HashSet<Integer>();

        // a queue holding the newly discovered nodes and their distance to the start node
        List<int[]> queue = new ArrayList<int[]>();

        // initialize queue with start node
        int[] innerList = new int[2];
        innerList[0] = pStartNode; // the node
        innerList[1] = 0; // the distance to the start node
        queue.add(innerList);

        // while the queue is not empty
        while (!queue.isEmpty()) {
            // remove first element from queue
            int[] queueElement = queue.get(0);
            int currentNode = queueElement[0];
            int distance = queueElement[1];
            queue.remove(0);

            // if the node was not already expanded
            if (!alreadyExpanded.contains(currentNode)) {
                // the node gets expanded now
                alreadyExpanded.add(currentNode);

                // add the distance of this node to shortestPathLengthSum
                shortestPathLengthSum += distance;

                // get the neighbors of the queue element
                Set<Integer> neighbors = catGraph.getNeighbors(currentNode);

                // iterate over all neighbors
                for (int neighbor : neighbors) {
                    // if the node was not already expanded
                    if (!alreadyExpanded.contains(neighbor)) {
                        // add the node to the queue, increase node distance by one
                        int[] tmpList = new int[2];
                        tmpList[0] = neighbor;
                        tmpList[1] = (distance + 1);
                        queue.add(tmpList);
                    }
                }
            }
        }
        return shortestPathLengthSum;
    }

}
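
The class above only gathers statistics; it needs a Wikipedia object that is connected to an existing JWPL database. Below is a minimal usage sketch following the usual JWPL setup via DatabaseConfiguration; the host, database and credential values are placeholders and must be replaced with your own connection settings, and the language must match the dump stored in the database.

import de.tudarmstadt.ukp.wikipedia.api.DatabaseConfiguration;
import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language;
import de.tudarmstadt.ukp.wikipedia.api.Wikipedia;
import de.tudarmstadt.ukp.wikipedia.api.WikipediaInfo;

public class WikipediaInfoExample {

    public static void main(String[] args) throws Exception {
        // connection settings for an existing JWPL database (placeholder values)
        DatabaseConfiguration dbConfig = new DatabaseConfiguration();
        dbConfig.setHost("SERVER_URL");
        dbConfig.setDatabase("DATABASE");
        dbConfig.setUser("USER");
        dbConfig.setPassword("PASSWORD");
        dbConfig.setLanguage(Language.english);

        // create the Wikipedia object and gather statistics over the whole page set
        Wikipedia wiki = new Wikipedia(dbConfig);
        WikipediaInfo info = new WikipediaInfo(wiki);

        System.out.println("Number of pages: " + info.getNumberOfPages());
        System.out.println("Average fan-out: " + info.getAverageFanOut());
    }
}

Note that counting the pages and computing the average fan-out iterate over the complete page set, so this can take considerable time on a full Wikipedia dump.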