// Java tutorial
/*******************************************************************************
 * Copyright (c) 2014 OSSMETER Partners.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    James Williams - Implementation.
 *******************************************************************************/
package org.ossmeter.platform.mining.msr14.stats;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

import org.ossmeter.platform.mining.msr14.model.Biodiversity;
import org.ossmeter.platform.mining.msr14.model.User;

import com.mongodb.Mongo;
import com.mongodb.ServerAddress;

/**
 * K-means clustering of {@link User} records over twelve integer activity
 * counters (commits, changes, additions, deletions, commits as author,
 * commits as committer, issues, issue comments, pull requests, pull request
 * comments, commit comments and forks).
 *
 * <p>{@link #main(String[])} reads users from a MongoDB database, clusters
 * them into four groups and writes the result to {@code groups.csv}.
 */
public class KMeansClustering {

    /** Number of numeric features describing a user or a centroid. */
    private static final int FEATURE_COUNT = 12;

    /** Upper bound on the number of assignment/update passes in {@link #compute}. */
    private static final int MAX_ITERATIONS = 15;

    /** Loading of users stops as soon as the sample holds more than this many users. */
    private static final int USER_SAMPLE_LIMIT = 5000;

    /**
     * Loads active users from the {@code biodiversity_1} database on
     * {@code localhost:12345}, clusters them into four groups and writes one
     * CSV row per user to {@code groups.csv} in the working directory.
     *
     * <p>Connection and I/O failures are reported via a stack trace; the
     * database client and the output file are always released.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Mongo mongo = null;
        FileWriter writer = null;
        try {
            mongo = new Mongo(new ServerAddress("localhost", 12345));
            Biodiversity bio = new Biodiversity(mongo.getDB("biodiversity_1"));
            bio.setClearPongoCacheOnSync(true);

            List<User> users = new ArrayList<User>();
            // Massively inefficient: every user is visited just to build the sample.
            for (User u : bio.getUsers()) {
                if (hasNoActivity(u)) {
                    continue;
                }
                users.add(u);
                if (users.size() > USER_SAMPLE_LIMIT) {
                    break;
                }
            }

            KMeansClustering kmeans = new KMeansClustering();
            HashMap<Centroid, List<User>> clusters = kmeans.compute(4, users);

            writer = new FileWriter(new File("groups.csv"));
            writer.write("group,numberOfCommits,numberOfChanges,numberOfAdditions,numberOfDeletions,"
                    + "numberOfCommitsAsAuthor,numberOfCommitsAsCommitter,numberOfIssues,numberOfIssueComments,"
                    + "numberOfPullRequests,numberOfPullRequestComments,numberOfCommitComments,numberOfForks\n");

            // Groups are numbered in the map's iteration order.
            int group = 0;
            for (List<User> members : clusters.values()) {
                for (User u : members) {
                    writer.write(toCsvRow(group, u));
                }
                group++;
                writer.flush();
            }
        } catch (UnknownHostException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release resources even when clustering or writing failed part-way.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (mongo != null) {
                mongo.close();
            }
        }
    }

    /**
     * Clusters the given users into at most {@code k} groups.
     *
     * <p>Runs at most {@value #MAX_ITERATIONS} passes and stops early once the
     * recomputed centroids equal the previous ones. Progress is printed to
     * standard output.
     *
     * @param k     the number of clusters requested; must be positive when
     *              {@code users} is non-empty
     * @param users the users to cluster
     * @return the final assignment, keyed by the centroid each user was
     *         closest to; empty when {@code users} is empty. Clusters that
     *         attract no user are absent, so the map may hold fewer than
     *         {@code k} entries.
     * @throws IllegalArgumentException if {@code k} is not positive and
     *                                  {@code users} is non-empty
     */
    public HashMap<Centroid, List<User>> compute(int k, List<User> users) {
        if (users.isEmpty()) {
            return new HashMap<Centroid, List<User>>();
        }
        if (k <= 0) {
            throw new IllegalArgumentException("k must be positive but was " + k);
        }

        HashMap<Centroid, List<User>> clusters = null;
        List<Centroid> centroids = createInitialCentroids(k, users);

        System.out.println("Initial centroids:");
        for (Centroid c : centroids) {
            System.out.println(c.toString());
        }

        for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++) {
            clusters = formClusters(centroids, users);

            System.out.println("Centroids:");
            for (Centroid c : clusters.keySet()) {
                System.out.println(c.toString() + " size: " + clusters.get(c).size());
            }

            List<Centroid> newCentroids = recomputeCentroids(clusters);
            if (compareCentroids(centroids, newCentroids)) {
                break;
            }
            centroids = newCentroids;
        }
        return clusters;
    }

    /**
     * Randomly selects distinct users to be the initial centroids.
     *
     * <p>Each user is picked at most once, so two centroids never start from
     * the same list entry.
     *
     * @param k     the number of centroids requested
     * @param users the users to sample from
     * @return {@code min(k, users.size())} centroids, each copying the feature
     *         values of one sampled user; empty if that number is not positive
     */
    protected List<Centroid> createInitialCentroids(int k, List<User> users) {
        Random random = new Random();
        List<Centroid> centroids = new ArrayList<Centroid>();

        // Never ask for more distinct users than exist, or the loop could not end.
        int target = Math.min(k, users.size());
        boolean[] taken = new boolean[users.size()];

        while (centroids.size() < target) {
            int index = random.nextInt(users.size());
            if (taken[index]) {
                continue;
            }
            taken[index] = true;
            centroids.add(centroidFrom(userFeatures(users.get(index))));
        }
        return centroids;
    }

    /**
     * Assigns every user to its nearest centroid.
     *
     * <p>Ties go to the centroid that appears first in {@code centroids}.
     *
     * @param centroids the candidate centroids; expected to be non-empty
     * @param users     the users to assign
     * @return a map from centroid to the users assigned to it; centroids that
     *         attract no user have no entry
     */
    protected HashMap<Centroid, List<User>> formClusters(List<Centroid> centroids, List<User> users) {
        HashMap<Centroid, List<User>> clusters = new HashMap<Centroid, List<User>>();

        for (User u : users) {
            double distanceToClosest = Double.MAX_VALUE;
            Centroid closest = null;

            for (Centroid c : centroids) {
                double distance = calculateDistanceToCentroid(u, c);
                if (distance < distanceToClosest) {
                    distanceToClosest = distance;
                    closest = c;
                }
            }

            List<User> members = clusters.get(closest);
            if (members == null) {
                members = new ArrayList<User>();
                clusters.put(closest, members);
            }
            members.add(u);
        }
        return clusters;
    }

    /**
     * Computes, for every cluster, a new centroid holding the per-feature mean
     * of the cluster's users.
     *
     * <p>Means are truncated to integers because {@link Centroid} stores
     * integer features. Sums are accumulated as {@code long} so that large
     * clusters cannot overflow before the division.
     *
     * @param clusters the current assignment of users to centroids
     * @return one new centroid per non-empty cluster
     */
    protected List<Centroid> recomputeCentroids(HashMap<Centroid, List<User>> clusters) {
        List<Centroid> newCentroids = new ArrayList<Centroid>();

        for (List<User> points : clusters.values()) {
            if (points.isEmpty()) {
                // Nothing to average; also avoids a division by zero.
                continue;
            }

            long[] sums = new long[FEATURE_COUNT];
            for (User user : points) {
                int[] values = userFeatures(user);
                for (int i = 0; i < FEATURE_COUNT; i++) {
                    sums[i] += values[i];
                }
            }

            int[] means = new int[FEATURE_COUNT];
            for (int i = 0; i < FEATURE_COUNT; i++) {
                means[i] = (int) (sums[i] / points.size());
            }
            newCentroids.add(centroidFrom(means));
        }
        return newCentroids;
    }

    /**
     * Computes the Euclidean distance between a user and a centroid over all
     * twelve features.
     *
     * @param user     the user
     * @param centroid the centroid
     * @return the non-negative Euclidean distance
     */
    protected double calculateDistanceToCentroid(User user, Centroid centroid) {
        int[] point = userFeatures(user);
        int[] center = centroidFeatures(centroid);

        double sumOfSquares = 0;
        for (int i = 0; i < FEATURE_COUNT; i++) {
            // Square by multiplication: Java's ^ is bitwise XOR, not a power operator.
            // The difference is taken in double arithmetic so it cannot overflow.
            double delta = (double) center[i] - point[i];
            sumOfSquares += delta * delta;
        }
        return Math.sqrt(sumOfSquares);
    }

    /**
     * Checks whether two centroid lists hold the same centroids, ignoring order.
     *
     * @param prev the centroids of the previous iteration
     * @param curr the newly computed centroids
     * @return true if both lists have the same size and every centroid in
     *         {@code curr} can be paired with a distinct centroid in
     *         {@code prev} that has identical feature values
     */
    protected boolean compareCentroids(List<Centroid> prev, List<Centroid> curr) {
        if (prev.size() != curr.size()) {
            return false;
        }

        // matched[i] marks entries of prev that are already paired up.
        boolean[] matched = new boolean[prev.size()];
        for (Centroid c : curr) {
            boolean found = false;
            for (int i = 0; i < prev.size() && !found; i++) {
                if (!matched[i] && sameValues(prev.get(i), c)) {
                    matched[i] = true;
                    found = true;
                }
            }
            if (!found) {
                return false;
            }
        }
        return true;
    }

    /** Returns true if the user has none of the activity that qualifies it for clustering. */
    private static boolean hasNoActivity(User u) {
        return u.getCommitAdditions() == 0
                && u.getCommitCount() == 0
                && u.getCommitDeletions() == 0
                && u.getNumberOfCommitComments() == 0
                && u.getNumberOfIssues() == 0
                && u.getNumberOfIssueComments() == 0
                && u.getNumberOfPullRequests() == 0
                && u.getNumberOfPullRequestComments() == 0;
    }

    /** Formats one CSV line: the group number followed by the user's twelve features. */
    private static String toCsvRow(int group, User user) {
        StringBuilder row = new StringBuilder();
        row.append(group);
        for (int value : userFeatures(user)) {
            row.append(',').append(value);
        }
        return row.append('\n').toString();
    }

    /** Returns the user's features in the column order used by the CSV output. */
    private static int[] userFeatures(User user) {
        return new int[] {
            user.getCommitCount(),
            user.getCommitTotalChanges(),
            user.getCommitAdditions(),
            user.getCommitDeletions(),
            user.getCommitsAsAuthor(),
            user.getCommitsAsCommitter(),
            user.getNumberOfIssues(),
            user.getNumberOfIssueComments(),
            user.getNumberOfPullRequests(),
            user.getNumberOfPullRequestComments(),
            user.getNumberOfCommitComments(),
            user.getNumberOfForks()
        };
    }

    /** Returns the centroid's features in the same order as {@link #userFeatures(User)}. */
    private static int[] centroidFeatures(Centroid c) {
        return new int[] {
            c.numberOfCommits,
            c.numberOfChanges,
            c.numberOfAdditions,
            c.numberOfDeletions,
            c.numberOfCommitsAsAuthor,
            c.numberOfCommitsAsCommitter,
            c.numberOfIssues,
            c.numberOfIssueComments,
            c.numberOfPullRequests,
            c.numberOfPullRequestComments,
            c.numberOfCommitComments,
            c.numberOfForks
        };
    }

    /** Builds a centroid from twelve features ordered as in {@link #userFeatures(User)}. */
    private Centroid centroidFrom(int[] features) {
        Centroid c = new Centroid();
        c.numberOfCommits = features[0];
        c.numberOfChanges = features[1];
        c.numberOfAdditions = features[2];
        c.numberOfDeletions = features[3];
        c.numberOfCommitsAsAuthor = features[4];
        c.numberOfCommitsAsCommitter = features[5];
        c.numberOfIssues = features[6];
        c.numberOfIssueComments = features[7];
        c.numberOfPullRequests = features[8];
        c.numberOfPullRequestComments = features[9];
        c.numberOfCommitComments = features[10];
        c.numberOfForks = features[11];
        return c;
    }

    /** Returns true if both centroids hold identical values for every feature. */
    private static boolean sameValues(Centroid a, Centroid b) {
        int[] left = centroidFeatures(a);
        int[] right = centroidFeatures(b);
        for (int i = 0; i < FEATURE_COUNT; i++) {
            if (left[i] != right[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * A cluster centre: one integer value per user feature.
     *
     * <p>Instances are used as map keys by identity; {@code equals} and
     * {@code hashCode} are deliberately not overridden, so two centroids with
     * equal values remain distinct keys.
     */
    class Centroid {
        int numberOfCommits;
        int numberOfChanges;
        int numberOfAdditions;
        int numberOfDeletions;
        int numberOfCommitsAsAuthor;
        int numberOfCommitsAsCommitter;
        int numberOfIssues;
        int numberOfIssueComments;
        int numberOfPullRequests;
        int numberOfPullRequestComments;
        int numberOfCommitComments;
        int numberOfForks;

        /** Lists the twelve feature values in CSV column order. */
        @Override
        public String toString() {
            return "Centroid: [" + numberOfCommits + ", " + numberOfChanges + ", " + numberOfAdditions + ", "
                    + numberOfDeletions + ", " + numberOfCommitsAsAuthor + ", " + numberOfCommitsAsCommitter + ", "
                    + numberOfIssues + ", " + numberOfIssueComments + ", " + numberOfPullRequests + ", "
                    + numberOfPullRequestComments + ", " + numberOfCommitComments + ", " + numberOfForks + "]";
        }
    }
}