dbscan.DBScanReducer.java Source code

Introduction

Here is the source code for dbscan.DBScanReducer.java
Source

/*
 * Copyright 2012 Department of Computer Science - University of Turin.
 * Author: Marco Sero
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package dbscan;

// Java
import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;

// Mongo
import org.bson.*;
import org.bson.types.*;
import com.mongodb.hadoop.io.*;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.DBCollection;
import com.mongodb.DB;

// Hadoop
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;

// jcoord
import uk.me.jstott.jcoord.LatLng;

/**
 * This class realizes the reduce function.
 * 
 * @author Marco Sero
 *
 */
public class DBScanReducer extends Reducer<Text, BSONWritable, Text, BSONWritable> {

    /**
     * MongoDB's stuff
     */
    private Mongo m;
    private DB db;
    private DBCollection collection;

    /**
     * Radius, in kilometers, to search neighbors for every point.
     */
    private static final double radius = 50; // km

    /**
     * Minimum number of points that nearPoints must have to emit a new cluster.
     */
    private static final int minPointsToCreateCluster = 30;

    /**
     * In a few special occasions, during the cycles of the reduce, could happen
     * that the new cluster location, calculated as the average of coordinates,
     * is significantly different from the previous location.
     * In this case, the cluster analyzed is not an event, but a common
     * Twitter's trending topic on a national or international scale.
     * We are only interest on event with an accurate location.
     * The scope of this variable is to denote if the new event location is
     * *very* different from previous one.
     */
    private static final int MAX_DISTANCE_OFFSET_NEW_CLUSTER_LOCATION = 100; // km

    // Connect to Mongo
    DBScanReducer() throws java.net.UnknownHostException {
        super();
        m = new Mongo();
        db = m.getDB("p");
        boolean auth = db.authenticate("username", "password".toCharArray());
        if (!auth)
            throw new RuntimeException("Login error");
        collection = db.getCollection("points");
    }

    /**
     * The reduce function has in input an 'array' of clusters with the same key.
     * Its job is to aggregate these clusters and to analyze their neighborhood
     * to merge all points into a unique cluster.
     * This method can be called much times, virtually every time the map function emits
     * a new cluster with a key equals to another cluster.
     *
     * @param pKey the key of the clusters in input
     * @param pValues the array iterable of clusters (type of these objects is BSONWritable)
     * @param pContext the context in which map-reduce works
     */
    @Override
    public void reduce(final Text eventKey, final Iterable<BSONWritable> eventValues, final Context eventContext)
            throws IOException, InterruptedException {

        //System.out.println("Reducing clusters with key : " + pKey +"...");

        // get the iterator
        Iterator<BSONWritable> iterator = eventValues.iterator();

        // alloc *new* cluster
        BSONWritable newCluster = new BSONWritable();

        int numPoints = 0;
        int k = 0;
        float avgLat = 0;
        float avgLon = 0;
        int numPointsAnalyzed = 0;
        ;

        // start loop for analyze every cluster
        while (iterator.hasNext()) {

            BSONObject aCluster = iterator.next();

            // at the first to loop, initialize the *new* cluster
            if (k == 0) {
                newCluster.put("loc", aCluster.get("loc"));
                newCluster.put("createdAt", aCluster.get("createdAt"));
                newCluster.put("hashtag", aCluster.get("hashtag"));
                newCluster.put("isEvent", aCluster.get("isEvent"));
            }

            // add points to *new* cluster
            numPoints += (Integer) aCluster.get("numPoints");

            // put all neighbor points to a ConcurrentHashMap
            Map<ObjectId, BSONObject> tmp = (Map<ObjectId, BSONObject>) aCluster.get("neighborPoints");
            Map<ObjectId, BSONObject> neighborPoints = new ConcurrentHashMap<ObjectId, BSONObject>();
            neighborPoints.putAll(tmp);

            // start loop for neighbor points         
            int i = 0;
            for (Iterator iteratorNeighborPoints = neighborPoints.entrySet().iterator(); iteratorNeighborPoints
                    .hasNext();) {

                Map.Entry<ObjectId, BSONObject> p = (Entry<ObjectId, BSONObject>) iteratorNeighborPoints.next();

                // needs to re-query MongoDB because the point now could be visited
                // by, for example, a map thread concurrent to this reduce thread
                BSONObject point = collection.findOne(new BasicDBObject("_id", p.getValue().get("_id")));
                boolean pointModified = false;

                if (point != null) {
                    if ((Boolean) point.get("visited") == false) {

                        // mark as visited
                        point.put("visited", true);
                        pointModified = true;

                        // find near points
                        BasicDBObject findNearPoints = new BasicDBObject();
                        findNearPoints.put("loc", new BasicDBObject("$within", new BasicDBObject("$center",
                                new Object[] { point.get("loc"), new Double(radius / 111.12) })));
                        findNearPoints.put("hashtag", point.get("hashtag"));
                        DBCursor nearPoints = collection.find(findNearPoints);

                        if (nearPoints.size() >= minPointsToCreateCluster) {
                            // increase performance by adding only points unvisited OR unclusterized
                            // two query BUT much less points to loop
                            findNearPoints.put("$or", new BasicDBObject[] { new BasicDBObject("visited", false),
                                    new BasicDBObject("clusterized", false) });
                            nearPoints = collection.find(findNearPoints);

                            toMap(neighborPoints, nearPoints.toArray());
                        }

                        // refer to null to free a bit of memory
                        findNearPoints = null;
                        nearPoints = null;

                    } // end if visited == false

                    // add point to cluster
                    if ((Boolean) point.get("clusterized") == false) {
                        // add the point to cluster
                        point.put("clusterized", true);
                        pointModified = true;
                        numPoints++;
                    }

                    // update new point in MongoDB
                    if (pointModified)
                        collection.findAndModify(new BasicDBObject("_id", point.get("_id")),
                                new BasicDBObject(point.toMap()));

                    // update average location
                    if (((BasicBSONObject) point.get("loc")).get("lat") instanceof Double)
                        avgLat += ((Double) ((BasicBSONObject) point.get("loc")).get("lat")).floatValue();
                    else
                        avgLat += ((Integer) ((BasicBSONObject) point.get("loc")).get("lat")).floatValue();
                    if (((BasicBSONObject) point.get("loc")).get("lon") instanceof Double)
                        avgLon += ((Double) ((BasicBSONObject) point.get("loc")).get("lon")).floatValue();
                    else
                        avgLon += ((Integer) ((BasicBSONObject) point.get("loc")).get("lon")).floatValue();

                    point = null;
                    i++;
                    numPointsAnalyzed++;
                }
            } // end loop for neighbor points
            k++;

            aCluster = null;
            neighborPoints = null;
            System.gc();

        } // end loop for clusters

        if (numPointsAnalyzed > 0) {
            // update average location of new cluster with the weighted average
            // of points analyzed
            avgLat = avgLat / (float) numPointsAnalyzed;
            avgLon = avgLon / (float) numPointsAnalyzed;

            // if the location of analyzed points is significantly different from
            // the old cluster location, then that cluster is not an event!
            BSONObject loc = (BSONObject) newCluster.get("loc");
            LatLng oldLatLon = new LatLng((Double) loc.get("lat"), (Double) loc.get("lon"));
            LatLng newLatLon = new LatLng(avgLat, avgLon);
            double distance = oldLatLon.distance(newLatLon);

            if (distance < MAX_DISTANCE_OFFSET_NEW_CLUSTER_LOCATION)
                // mark as event
                newCluster.put("isEvent", true);
            else
                // mark as no-event
                newCluster.put("isEvent", false);

            // update new position (only if is valid)
            if (avgLat >= -90.0f && avgLat <= 90.0f && avgLon >= -180.0f && avgLon <= 180.0f) {
                DecimalFormat df = new DecimalFormat("##.######");
                Map<String, Float> newLoc = new TreeMap<String, Float>();
                newLoc.put("lat", Float.parseFloat(df.format(avgLat)));
                newLoc.put("lon", Float.parseFloat(df.format(avgLon)));
                newCluster.put("loc", newLoc);
            }

        }

        // update new cluster object
        newCluster.put("numPoints", numPoints);
        newCluster.put("neighborPoints", new HashMap<ObjectId, BSONObject>());

        // write to context if and only if the new cluster has enought points
        if (numPoints > 30)
            eventContext.write(eventKey, newCluster);

        newCluster = null;

        // IN CASE OF MEMORY PROBLEMS: force garbage collector
        // it could not be elegant and is often not recommended,
        // but it works
        System.gc();

    }

    /**
     * Convert a list of DBObject into a Map
     * @param dest the destionation Map
     * @param list the input List
     */
    public static final void toMap(Map dest, List<DBObject> list) {

        // Iterate through each element in source and put it in dest. 
        int j = 0;
        for (int i = 0; i < list.size(); ++i) {
            if (!dest.containsKey(list.get(i).get("_id"))) {
                dest.put(list.get(i).get("_id").toString(), list.get(i));
                j++;
            }
        }
    }

}