TwitterClustering.java Source code

Java tutorial

Introduction

Here is the source code for TwitterClustering.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Stream;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

/**
 *
 * @author sreesha.n
 */
public class TwitterClustering {

    /**
     * @param args
     *            the command line arguments
     */
    public static <T> Set<T> union(Set<T> setA, Set<T> setB) {
        Set<T> tmp = new TreeSet<T>(setA);
        tmp.addAll(setB);
        return tmp;
    }

    public static <T> Set<T> intersection(Set<T> setA, Set<T> setB) {
        Set<T> tmp = new TreeSet<T>();
        for (T x : setA) {
            if (setB.contains(x)) {
                tmp.add(x);
            }
        }
        return tmp;
    }

    public static float jaccardDistance(Set<String> a, Set<String> b) {

        if (a.size() == 0 || b.size() == 0) {
            return 0;
        }

        Set<String> unionXY = new HashSet<String>(a);
        unionXY.addAll(b);

        Set<String> intersectionXY = new HashSet<String>(a);
        intersectionXY.retainAll(b);
        float retValue = (float) intersectionXY.size() / (float) unionXY.size();
        return retValue;

    }

    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map) {
        Map<K, V> result = new LinkedHashMap<>();
        Stream<Map.Entry<K, V>> st = map.entrySet().stream();

        st.sorted(Comparator.comparing(e -> e.getValue())).forEach(e -> result.put(e.getKey(), e.getValue()));

        return result;
    }

    public static void main(String[] args) throws FileNotFoundException, IOException {
        // TODO code application logic here

        File outFile = new File(args[3]);
        Scanner s = new Scanner(new File(args[1])).useDelimiter(",");
        JSONParser parser = new JSONParser();
        Set<Cluster> clusterSet = new HashSet<Cluster>();
        HashMap<String, Tweet> tweets = new HashMap();
        FileWriter fw = new FileWriter(outFile.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(fw);

        // init
        try {

            Object obj = parser.parse(new FileReader(args[2]));

            JSONArray jsonArray = (JSONArray) obj;

            for (int i = 0; i < jsonArray.size(); i++) {

                Tweet twt = new Tweet();
                JSONObject jObj = (JSONObject) jsonArray.get(i);
                String text = jObj.get("text").toString();

                long sum = 0;
                for (int y = 0; y < text.toCharArray().length; y++) {

                    sum += (int) text.toCharArray()[y];
                }

                String[] token = text.split(" ");
                String tID = jObj.get("id").toString();

                Set<String> mySet = new HashSet<String>(Arrays.asList(token));
                twt.setAttributeValue(sum);
                twt.setText(mySet);
                twt.setTweetID(tID);
                tweets.put(tID, twt);

            }

            // preparing initial clusters
            int i = 0;
            while (s.hasNext()) {
                String id = s.next();// id
                Tweet t = tweets.get(id.trim());
                clusterSet.add(new Cluster(i + 1, t, new LinkedList()));
                i++;
            }

            Iterator it = tweets.entrySet().iterator();

            for (int l = 0; l < 2; l++) { // limit to 25 iterations

                while (it.hasNext()) {
                    Map.Entry me = (Map.Entry) it.next();

                    // calculate distance to each centroid
                    Tweet p = (Tweet) me.getValue();
                    HashMap<Cluster, Float> distMap = new HashMap();

                    for (Cluster clust : clusterSet) {

                        distMap.put(clust, jaccardDistance(p.getText(), clust.getCentroid().getText()));
                    }

                    HashMap<Cluster, Float> sorted = (HashMap<Cluster, Float>) sortByValue(distMap);

                    sorted.keySet().iterator().next().getMembers().add(p);

                }

                // calculate new centroid and update Clusterset
                for (Cluster clust : clusterSet) {

                    TreeMap<String, Long> tDistMap = new TreeMap();

                    Tweet newCentroid = null;
                    Long avgSumDist = new Long(0);
                    for (int j = 0; j < clust.getMembers().size(); j++) {

                        avgSumDist += clust.getMembers().get(j).getAttributeValue();
                        tDistMap.put(clust.getMembers().get(j).getTweetID(),
                                clust.getMembers().get(j).getAttributeValue());
                    }
                    if (clust.getMembers().size() != 0) {
                        avgSumDist /= (clust.getMembers().size());
                    }

                    ArrayList<Long> listValues = new ArrayList<Long>(tDistMap.values());

                    if (tDistMap.containsValue(findClosestNumber(listValues, avgSumDist))) {
                        // found closest
                        newCentroid = tweets
                                .get(getKeyByValue(tDistMap, findClosestNumber(listValues, avgSumDist)));
                        clust.setCentroid(newCentroid);
                    }

                }

            }
            // create an iterator
            Iterator iterator = clusterSet.iterator();

            // check values
            while (iterator.hasNext()) {

                Cluster c = (Cluster) iterator.next();
                bw.write(c.getId() + "\t");
                System.out.print(c.getId() + "\t");

                for (Tweet t : c.getMembers()) {
                    bw.write(t.getTweetID() + ", ");
                    System.out.print(t.getTweetID() + ",");

                }
                bw.write("\n");
                System.out.println("");
            }

            System.out.println("");

            System.out.println("SSE " + sumSquaredErrror(clusterSet));

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            bw.close();
            fw.close();
        }
    }

    public static double sumSquaredErrror(Set<Cluster> clusterSet) {

        double sse = 0;

        for (Cluster clust : clusterSet) {

            for (Tweet p : clust.getMembers()) {

                double dist = jaccardDistance(p.getText(), clust.getCentroid().getText());
                sse += dist * dist;
            }

        }
        return sse;
    }

    public static Long findClosestNumber(List list, Long num) {
        if (list.size() > 0) { // Check list does not empty
            Long smaller = (Long) Collections.min(list); // get min number from
            // the list
            Long larger = (Long) Collections.max(list); // get max number from
            // the list

            for (int i = 0; i < list.size(); i++) { // Traverse list
                if (num == (Long) list.get(i)) // if find the passed number in
                // the list
                {
                    return num; // than return num
                }
                if (num > (Long) list.get(i) && smaller < (Long) list.get(i)) // find
                // nearest
                // smaller
                {
                    smaller = (Long) list.get(i);
                }
                if (num < (Long) list.get(i) && larger > (Long) list.get(i)) // find
                // nearest
                // larger
                {
                    larger = (Long) list.get(i);
                }
            }
            return (num - smaller < larger - num ? smaller : larger); // return
            // closest
            // number
        }
        return new Long(0);
    }

    public static <T, E> T getKeyByValue(Map<T, E> map, E value) {
        for (Entry<T, E> entry : map.entrySet()) {
            if (Objects.equals(value, entry.getValue())) {
                return entry.getKey();
            }
        }
        return null;
    }

}