org.wikimedia.analytics.kraken.funnel.Funnel.java Source code

Java tutorial

Introduction

Here is the source code for org.wikimedia.analytics.kraken.funnel.Funnel.java

Source

/**
 *Copyright (C) 2012-2013  Wikimedia Foundation
 *
 *This program is free software; you can redistribute it and/or
 *modify it under the terms of the GNU General Public License
 *as published by the Free Software Foundation; either version 2
 *of the License, or (at your option) any later version.
 *
 *This program is distributed in the hope that it will be useful,
 *but WITHOUT ANY WARRANTY; without even the implied warranty of
 *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *GNU General Public License for more details.
 *
 *You should have received a copy of the GNU General Public License
 *along with this program; if not, write to the Free Software
 *Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
    
 */

/*
 * http://meta.wikimedia.org/wiki/Research:Metrics#Funnel_metrics
 */
package org.wikimedia.analytics.kraken.funnel;

import org.wikimedia.analytics.kraken.exceptions.MalformedFunnelException;

import java.util.*;
import java.util.Map.Entry;
import java.util.regex.PatternSyntaxException;

import org.jgrapht.DirectedGraph;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.traverse.DepthFirstIterator;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import org.apache.commons.lang3.tuple.Pair;

/**
 * Funnel fun!.
 *
 * This class provides automated funnel analysis using a social network analysis
 * approach. Two parameters need to be provided:
 * 1) The Funnel definition
 * 2) A browsing history of a single user
 *
 * Funnel Definition
 * A funnel is defined as a Directed Acyclic Graph (DAG), the simplest funnel
 * would look like: A -> B -> C
 * You have to provide the different paths that make up a funnel, separated by
 * semicolons; the nodes within one path are separated by commas. The {A,B,C}
 * funnel should be provided as "A,B,C;". Make sure that all steps in the funnel
 * are valid nodes.
 *
 * Browsing History
 * The browsing history should be a time sorted list of nodes visited by a
 * single visitor, ideally during one session.
 */
public class Funnel {

    /** The funnel. Never assigned inside this class; kept because it is a public field. */
    public Funnel funnel;

    /** The graph describing the funnel; vertices are funnel steps, edges are allowed transitions. */
    private DirectedGraph<FunnelNode, DefaultEdge> graph;

    /** The start vertices, i.e. all vertices of the funnel graph without incoming edges. */
    public List<FunnelNode> startVertices;

    /** The end vertices, i.e. all vertices of the funnel graph without outgoing edges. */
    public List<FunnelNode> endVertices;

    /** The paths leading from a start vertex to an end vertex. */
    public final List<FunnelPath> paths = new ArrayList<FunnelPath>();

    /** Node definition, filled by {@link #parseNodeDefinition(String)}. */
    private final Map<String, String> nodeDefinition = new HashMap<String, String>();

    /**
     * Constructor for the funnel using a built-in example definition.
     *
     * NOTE(review): the node definition "page" contains no '=' and
     * {@link #parseNodeDefinition(String)} rejects definitions that yield no
     * key/value pair, so this constructor appears to always throw. The example
     * funnel also separates its edges with newlines while
     * {@link #constructFunnelGraph(String)} splits on ';'. Confirm the intended
     * defaults before relying on this constructor.
     *
     * @throws MalformedFunnelException the malformed funnel exception
     */
    public Funnel() throws MalformedFunnelException {
        this("page", "A,B\n" + "B, C\n" + "D, B\n" + "B, E\n");
    }

    /**
     * Constructor for the funnel.
     *
     * @param nodeDefinition the node definition, see
     * {@link #parseNodeDefinition(String)} for the format that is accepted. The
     * keys *should* map to the keys as defined in the EventLogging schema that
     * is used for this particular funnel and should be present in the Node query
     * string.
     * @param funnelDefinition the funnel definition, see
     * {@link #constructFunnelGraph(String)}
     * @throws MalformedFunnelException if either definition is malformed, if the
     * funnel has no edge or fewer than two nodes, or if the funnel contains a cycle
     */
    public Funnel(String nodeDefinition, String funnelDefinition) throws MalformedFunnelException {
        this.parseNodeDefinition(nodeDefinition);
        System.out.println("Node definition: " + nodeDefinition);
        System.out.println("Funnel definition: " + funnelDefinition);
        this.graph = this.constructFunnelGraph(funnelDefinition);
        if (this.graph.edgeSet().isEmpty() || this.graph.vertexSet().size() < 2) {
            System.out.println(this.graph.toString());
            throw new MalformedFunnelException("A funnel needs to have two connected nodes at the very minimum.");
        }
        if (!isDag()) {
            throw new MalformedFunnelException("A funnel cannot have cycles (A->B->A).");
        }
        // The order matters: the unique paths are computed from the start and end vertices.
        this.getStartingVertices();
        this.getDestinationVertices();
        this.determineUniquePaths();
    }

    /**
     * Parses the node definition and stores the result in this funnel.
     *
     * The definition is split on '=' and every token is stored as key with the
     * token that follows it as value, so "a=b" yields {a: b} and "a=b=c" yields
     * {a: b, b: c}. Entries accumulate over repeated calls.
     *
     * NOTE(review): earlier documentation and the previous error message
     * described a colon separated format; the code has always split on '='.
     * Confirm which format callers are expected to use.
     *
     * @param nodeDefinition the node definition
     * @throws MalformedFunnelException if the definition is null or if no
     * key/value pair is known after parsing
     */
    public void parseNodeDefinition(String nodeDefinition) throws MalformedFunnelException {
        if (nodeDefinition == null) {
            throw new MalformedFunnelException("The node definition must not be null.");
        }
        String[] params = nodeDefinition.split("=");
        for (int i = 0; i + 1 < params.length; i++) {
            this.nodeDefinition.put(params[i], params[i + 1]);
        }
        if (this.nodeDefinition.isEmpty()) {
            throw new MalformedFunnelException(
                    "Your node definition does not use the equals sign (=) as a separator.");
        }
    }

    /**
     * Construct graph.
     *
     * The definition consists of paths separated by ';'. Each path is a comma
     * separated list of at least two nodes; consecutive nodes are connected by
     * an edge, so "A,B,C" adds the edges A->B and B->C. Blank paths (for
     * example caused by a trailing or doubled ';') are ignored. The node texts
     * are handed to {@link FunnelNode} exactly as they appear in the definition.
     *
     * @param funnelDefinition the paths that make up the funnel
     * @return the directed graph
     * @throws MalformedFunnelException if the definition is null or if a
     * non-blank path names fewer than two nodes
     * @throws PatternSyntaxException
     */
    public final DirectedGraph<FunnelNode, DefaultEdge> constructFunnelGraph(String funnelDefinition)
            throws PatternSyntaxException, MalformedFunnelException {
        if (funnelDefinition == null) {
            throw new MalformedFunnelException("The funnel definition must not be null.");
        }
        DirectedGraph<FunnelNode, DefaultEdge> dg = new DefaultDirectedGraph<FunnelNode, DefaultEdge>(
                DefaultEdge.class);
        // jGraphT offers no method to fetch an existing vertex that equals a
        // given one, hence the nodes that already exist are tracked here. A
        // list is used deliberately: it only relies on FunnelNode.equals().
        List<FunnelNode> nodes = new ArrayList<FunnelNode>();
        for (String pathData : funnelDefinition.split(";")) {
            if (pathData.trim().isEmpty()) {
                continue;
            }
            String[] steps = pathData.split(",");
            if (steps.length < 2) {
                throw new MalformedFunnelException(
                        "Each funnel path needs at least two comma separated nodes, but found: '" + pathData + "'.");
            }
            FunnelNode source = resolveNode(dg, nodes, new FunnelNode(steps[0]));
            for (int i = 1; i < steps.length; i++) {
                FunnelNode target = resolveNode(dg, nodes, new FunnelNode(steps[i]));
                if (!dg.containsEdge(source, target)) {
                    dg.addEdge(source, target);
                }
                source = target;
            }
        }
        System.out.println("Graph Summary: " + dg.toString());

        return dg;
    }

    /**
     * Returns the already registered node that equals the candidate, or
     * registers the candidate as a new vertex and returns it.
     *
     * @param dg the graph that receives new vertices
     * @param nodes the nodes that have been registered so far
     * @param candidate the freshly created node
     * @return the node instance that represents the candidate in the graph
     */
    private static FunnelNode resolveNode(DirectedGraph<FunnelNode, DefaultEdge> dg, List<FunnelNode> nodes,
            FunnelNode candidate) {
        int index = nodes.indexOf(candidate);
        if (index >= 0) {
            return nodes.get(index);
        }
        dg.addVertex(candidate);
        nodes.add(candidate);
        return candidate;
    }

    /**
     * Simple wrapper script that conducts all the steps to do a funnel
     * analysis. The result returned by the analysis run is currently not
     * reported to the caller.
     *
     * @param userToken unique identifier for a visitor.
     * @param history the {@link DirectedGraph} describing the {@link UserActionNode} that the visitor took.
     */
    public final void analysis(String userToken, DirectedGraph<Node, DefaultEdge> history) {
        new Analysis().run(userToken, history, this);
    }

    /**
     * Prints, for every node of every path, the path id, the node, its
     * impressions, its bounces and its completion rate to standard out.
     */
    public final void aggregateResults() {
        for (FunnelPath path : this.paths) {
            for (FunnelNode node : path.nodes) {
                System.out.println(
                        "Path id: " + path.id + " Node id: " + node.toString() + " impressions: " + node.impression
                                + " bounced: " + node.bounced + " Node completion rate: " + node.completionRate());
            }
        }
    }

    /**
     * Determine all the unique paths between all the {@link #startVertices} and {@link #endVertices}.
     * Paths that were determined earlier are discarded.
     */
    private void determineUniquePaths() {
        this.paths.clear();

        for (FunnelNode startVertex : startVertices) {
            findPathToEnd(startVertex, new ArrayList<FunnelNode>());
        }
    }

    /**
     * Depth first search that records a {@link FunnelPath} for every route
     * from the given node to one of the {@link #endVertices}.
     *
     * @param from the node to continue the search from
     * @param breadcrumbs the nodes visited on the way to {@code from}; it is
     * restored to its original content before this method returns
     */
    private void findPathToEnd(FunnelNode from, List<FunnelNode> breadcrumbs) {
        breadcrumbs.add(from);
        for (DefaultEdge edge : graph.outgoingEdgesOf(from)) {
            FunnelNode to = graph.getEdgeTarget(edge);
            if (endVertices.contains(to)) {
                FunnelPath newPath = new FunnelPath(paths.size());
                newPath.nodes.addAll(breadcrumbs);
                newPath.nodes.add(to);
                paths.add(newPath);
            } else {
                findPathToEnd(to, breadcrumbs);
            }
        }
        // Backtrack, otherwise this node would also show up in the paths that
        // are found through the sibling branches of the caller.
        breadcrumbs.remove(breadcrumbs.size() - 1);
    }

    /**
     * Retrieve all the possible start vertices from the funnel, these are the
     * vertices without incoming edges.
     */
    private void getStartingVertices() {
        startVertices = new ArrayList<FunnelNode>();
        for (FunnelNode vertex : graph.vertexSet()) {
            if (graph.inDegreeOf(vertex) == 0) {
                startVertices.add(vertex);
            }
        }
    }

    /**
     * Retrieve all the possible destination vertices from the funnel, these
     * are the vertices without outgoing edges.
     */
    private void getDestinationVertices() {
        endVertices = new ArrayList<FunnelNode>();
        for (FunnelNode vertex : graph.vertexSet()) {
            if (graph.outDegreeOf(vertex) == 0) {
                endVertices.add(vertex);
            }
        }
    }

    /**
     * Checks if funnel is a DAG.
     *
     * @return true if the funnel is valid directed acyclic graph (DAG) else
     * return false.
     */
    public boolean isDag() {
        CycleDetector<FunnelNode, DefaultEdge> cycle = new CycleDetector<FunnelNode, DefaultEdge>(this.graph);
        return !cycle.detectCycles();
    }

    /**
     * This function has input a map with as key the usertoken string and as
     * value another map. In this second map, the key is a datetime and the
     * value is the {@link JsonObject} that contains the actual eventLogging
     * data for that token/timestamp combination.
     *
     * This function iterates over all usertokens, over all dates in ascending
     * order and constructs for each usertoken a new graph representing the
     * actions of that user token: every event becomes a vertex and every event
     * is connected to the event that directly precedes it. A usertoken with a
     * single event therefore gets a graph with one vertex and no edges. An
     * event without data (null) adds no vertex and is not connected to its
     * neighbours.
     *
     * @param jsonData the events per usertoken and timestamp
     * @return a map with as key the usertoken and as value the graph
     * representing their actions.
     */
    public Map<String, DirectedGraph<Node, DefaultEdge>> constructUserGraph(
            Map<String, Map<Date, JsonObject>> jsonData) {
        Map<String, DirectedGraph<Node, DefaultEdge>> graphs = new HashMap<String, DirectedGraph<Node, DefaultEdge>>();
        for (Entry<String, Map<Date, JsonObject>> kv : jsonData.entrySet()) {
            Map<Date, JsonObject> events = kv.getValue();
            DirectedGraph<Node, DefaultEdge> dg = new DefaultDirectedGraph<Node, DefaultEdge>(DefaultEdge.class);
            List<Date> dates = new ArrayList<Date>(events.keySet());
            Collections.sort(dates);

            Node previous = null;
            for (Date date : dates) {
                Node current = addUserActionNodeToGraph(dg, events.get(date));
                if (previous != null && current != null && !dg.containsEdge(previous, current)) {
                    dg.addEdge(previous, current);
                }
                previous = current;
            }
            graphs.put(kv.getKey(), dg);
        }
        return graphs;
    }

    /**
     * Creates a {@link UserActionNode} for the given event and adds it to the
     * graph unless the graph already contains an equal vertex.
     *
     * @param graph the graph that receives the vertex
     * @param json the event data, may be null
     * @return the node that was created for the event, or null if there is no
     * event data
     */
    private UserActionNode addUserActionNodeToGraph(DirectedGraph<Node, DefaultEdge> graph, JsonObject json) {
        if (json == null || json.isJsonNull()) {
            return null;
        }
        UserActionNode node = new UserActionNode(json);
        if (!graph.containsVertex(node)) {
            graph.addVertex(node);
        }
        return node;
    }
}