geocluster.GeoclusterComponent.java Source code

Java tutorial

Introduction

Here is the source code for geocluster.GeoclusterComponent.java

Source

package geocluster;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.spatial.geometry.DistanceUnits;
import org.apache.lucene.spatial.geometry.FloatLatLng;
import org.apache.lucene.spatial.geometry.LatLng;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.clustering.ClusteringParams;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import ch.hsr.geohash.GeoHash;

/**
 * Provide a plugin for geo-clustering results. 
 * http://localhost:8123/solr/search?facet=true&group.limit=5&geocluster.clusterField=field_place&geocluster.resolution=1221.96875&geocluster.zoomLevel=7&fl=*&geocluster.idField=ss_search_api_id&group.field=f_ss_field_place:geohash_geocluster_index_3&facet.field=f_ss_field_place:geohash_geocluster_index_3&qt=/geocluster&fq=index_id:geocluster_index&geocluster.clusterDistance=33&geocluster.groupField=f_ss_field_place:geohash_geocluster_index_3&geocluster.geohashField=geohashs_field_place:geohash&qf=t_field_place:latlon^1.0&geocluster.geohashLength=3&rows=1000000&start=0&q=*:*&facet.prefix=3_&geocluster.latlonField=t_field_place:latlon&group=true
 */
public class GeoclusterComponent extends SearchComponent implements SolrCoreAware {

    // Geocluster schema: names of the fields set on each cluster document.

    /**
     * Cluster center. Held as a LatLng while clustering and replaced by a
     * "lat,lng" string when the cluster is finished.
     */
    public static final String GEOCLUSTER_CENTER = "locs_center";

    /** Working field holding the map of member documents; removed when the cluster is finished. */
    public static final String GEOCLUSTER_DOCS = "docs";

    /** Ids (keys of the member-document map) of the documents in the cluster. */
    public static final String GEOCLUSTER_DOC_IDS = "sm_doc_ids";

    /** Number of member documents in the cluster. */
    public static final String GEOCLUSTER_COUNT = "is_count";

    // A logger is shared, never reassigned and never serialized: static final, not "transient static".
    private static final Logger log = LoggerFactory.getLogger(GeoclusterComponent.class);

    /**
     * Name of the boolean request parameter that switches this component on
     * (checked in prepare, process and modifyRequest). The component-specific
     * parameters use it as a prefix, for example "geocluster.clusterDistance".
     */
    public static final String COMPONENT_NAME = "geocluster";

    /** Arguments passed to {@link #init(NamedList)}; stored but not read elsewhere in this class. */
    private NamedList initParams;

    // NOTE(review): the values below are per-request settings kept on the component
    // instance. If Solr shares one instance between concurrent requests they can be
    // overwritten while another request is still clustering -- confirm and, if so,
    // move them into a per-request object.

    /** True once prepare() has successfully parsed all geocluster parameters for the current request. */
    protected boolean initialized = false;

    /**
     * Cluster distance in pixels.
     */
    protected double clusterDistance;

    /**
     * Current zoom level for clustering.
     */
    protected int zoomLevel;

    /**
     * Resolution in meters / pixel based on zoom_level.
     */
    protected double resolution;

    /**
     * Geohash length for clustering by a specified distance in pixels.
     */
    protected int geohashLength;

    /**
     * Field used for clustering, for example field_place.
     */
    protected String clusterField;

    /**
     * Field used for grouping, for example f_ss_field_place:geohash_geocluster_index_3.
     */
    protected String groupField;

    /** Name of the document field holding the geohash (request parameter geocluster.geohashField). */
    protected String geohashField;

    /** Name of the document field holding the document id (request parameter geocluster.idField). */
    protected String idField;

    /** Name of the document field holding the "lat,lon" string (request parameter geocluster.latlonField). */
    protected String latlonField;

    /**
     * Parses the geocluster request parameters and stores them on this component.
     *
     * Nothing is parsed unless the boolean request parameter {@link #COMPONENT_NAME}
     * is true. When a numeric parameter is missing or malformed, a warning is
     * logged, {@link #initialized} stays false and process() skips clustering.
     *
     * @param rb response builder of the current request
     * @throws IOException declared by the overridden method; not thrown here
     */
    @Override
    public void prepare(ResponseBuilder rb) throws IOException {
        SolrParams params = rb.req.getParams();

        // Reset first so that a disabled or failed request never reuses an earlier request's state.
        this.initialized = false;
        if (!params.getBool(COMPONENT_NAME, false)) {
            return;
        }

        // We rely on a docList to access data to cluster upon.
        // TODO: this just doesn't work
        // rb.setNeedDocList( true );
        // Alternative workaround, see getDocList in Grouping.java
        rb.setFieldFlags(SolrIndexSearcher.GET_DOCLIST);

        try {
            // Parse geocluster specific query parameters into locals first, so a
            // failure half-way through leaves the fields untouched.
            // These are provided by GeoclusterSearchApiSolrService.
            double parsedClusterDistance = Double.parseDouble(params.get("geocluster.clusterDistance"));
            int parsedZoomLevel = Integer.parseInt(params.get("geocluster.zoomLevel"));
            double parsedResolution = Double.parseDouble(params.get("geocluster.resolution"));
            int parsedGeohashLength = Integer.parseInt(params.get("geocluster.geohashLength"));

            // The requested cluster distance is kept as parsed; it used to be
            // overwritten with a hard-coded 200 at the end of this method.
            this.clusterDistance = parsedClusterDistance;
            this.zoomLevel = parsedZoomLevel;
            this.resolution = parsedResolution;
            this.geohashLength = parsedGeohashLength;
            this.clusterField = params.get("geocluster.clusterField");
            this.groupField = params.get("geocluster.groupField");
            this.geohashField = params.get("geocluster.geohashField");
            this.idField = params.get("geocluster.idField");
            this.latlonField = params.get("geocluster.latlonField");
            this.initialized = true;
        } catch (RuntimeException e) {
            // A missing value surfaces as NullPointerException/NumberFormatException
            // from the parse calls. Report it instead of silently skipping.
            log.warn("Invalid or missing geocluster parameters, clustering will be skipped", e);
        }
    }

    /**
     * Replaces the "grouped" section of the response with a "clusters" document list.
     *
     * Steps: load the result documents, build one cluster per group, merge
     * clusters with overlapping neighbors, finalize the cluster documents and
     * swap them into the response. Clustering is skipped (with a log message)
     * when the component is disabled, prepare() did not succeed, or the response
     * has no result list or no grouped data for {@link #groupField}.
     *
     * @param rb response builder of the current request
     * @throws IOException if loading the result documents fails
     */
    @Override
    public void process(ResponseBuilder rb) throws IOException {
        SolrParams params = rb.req.getParams();
        if (!params.getBool(COMPONENT_NAME, false) || !this.initialized) {
            log.info("skipped clustering");
            return;
        }
        log.info("started clustering");

        DocListAndSet results = rb.getResults();
        if (results == null || results.docList == null) {
            log.warn("skipped clustering: no document list available in the response");
            return;
        }

        // Get grouped values. Any level may be absent, e.g. when the request did not
        // enable grouping or grouped on a different field.
        NamedList values = rb.rsp.getValues();
        NamedList grouped = (NamedList) values.get("grouped");
        NamedList groupedValue = grouped == null ? null : (NamedList) grouped.get(groupField);
        ArrayList<NamedList> groups = groupedValue == null ? null : (ArrayList) groupedValue.get("groups");
        if (groups == null) {
            log.warn("skipped clustering: no grouped results for field " + groupField);
            return;
        }

        // The helper fills a document -> id map; clustering needs the inverse lookup.
        Map<SolrDocument, Integer> docIdsReverse = new HashMap<SolrDocument, Integer>(results.docList.size());
        getSolrDocumentList(results.docList, rb.req, docIdsReverse);

        Map<Integer, SolrDocument> docIds = new HashMap<Integer, SolrDocument>(results.docList.size());
        for (Entry<SolrDocument, Integer> docId : docIdsReverse.entrySet()) {
            docIds.put(docId.getValue(), docId.getKey());
        }

        // Add all points within the core geohash to a cluster.
        Map<String, SolrDocument> clusterMap = clusterByHashes(docIds, groups);

        // Compare neighbor overlaps.
        neighborCheck(clusterMap);

        // Finalize cluster data.
        SolrDocumentList resultClusters = finalizeClusters(clusterMap);

        // Remove normal grouped docs from response.
        rb.rsp.getValues().remove(rb.rsp.getValues().indexOf("grouped", 0));

        // Add our custom cluster map instead.
        rb.rsp.add("clusters", resultClusters);

        log.info("clustering finished");
    }

    /**
     * Builds one cluster per group, keyed by the group's value (the geohash prefix).
     *
     * The first document of a group creates the cluster and the remaining ones
     * are added to it. The cluster center is then computed from the members.
     * Groups without a document list or without any resolvable document
     * produce no cluster.
     *
     * @param docIds loaded documents keyed by the id used in the group document lists
     * @param groups the "groups" entries of the grouped response
     * @return map from group value to cluster document
     */
    private Map<String, SolrDocument> clusterByHashes(Map<Integer, SolrDocument> docIds,
            ArrayList<NamedList> groups) {
        // At most one cluster per group, so size by group count rather than document count.
        Map<String, SolrDocument> clusterMap = new HashMap<String, SolrDocument>(Math.max(16, groups.size() * 2));
        // Per-document messages are only assembled when they will actually be written.
        boolean verbose = log.isInfoEnabled();

        // Add all points within the core geohash to a cluster.
        for (NamedList group : groups) {
            String geohashPrefix = (String) group.get("groupValue");
            log.info("Prefix: " + geohashPrefix);

            DocSlice docList = (DocSlice) group.get("doclist");
            if (docList == null) {
                continue;
            }

            SolrDocument cluster = null;
            DocIterator iterator = docList.iterator();
            while (iterator.hasNext()) {
                Integer docId = iterator.next();
                SolrDocument doc = docIds.get(docId);
                if (doc == null) {
                    // The group references a document that was not loaded; it cannot be clustered.
                    log.warn("Document " + docId + " of group " + geohashPrefix + " was not loaded, skipping it");
                    continue;
                }

                boolean isParent = cluster == null;
                if (isParent) {
                    // Init cluster
                    cluster = initCluster(doc, docId, clusterMap, geohashPrefix, docList);
                } else {
                    addCluster(cluster, doc, docId);
                }

                if (verbose) {
                    // Values are only logged, so no cast that could fail on non-String field values.
                    Object geohash = doc.getFieldValue(this.geohashField);
                    Object latlon = doc.getFieldValue(this.latlonField);
                    Object id = doc.getFieldValue(this.idField);
                    log.info((isParent ? "Parent: " : "Child : ") + id + ", geohash: " + geohash + ", latlon: "
                            + latlon);
                }
            }

            // An empty group never created a cluster; there is nothing to update then.
            if (cluster != null) {
                updateCluster(cluster);
            }
        }
        return clusterMap;
    }

    /**
     * Merges each cluster into the first neighboring cluster it overlaps with.
     *
     * For every entry with a non-null key, the neighbor geohashes returned by
     * GeohashHelper.getAdjacecentNorthWest are looked up in the map. On the first
     * neighbor for which shouldCluster is true, the current cluster is merged
     * into that neighbor and removed from the map.
     *
     * TODO: the current cluster is always the one that is removed, because that
     * lets us use Iterator.remove. Ideally the geohash quadrant in which the new
     * super cluster ends up should decide which of the two is deleted.
     *
     * @param clusterMap clusters keyed by geohash prefix; modified in place
     */
    private void neighborCheck(Map<String, SolrDocument> clusterMap) {
        Iterator<Entry<String, SolrDocument>> entries = clusterMap.entrySet().iterator();
        while (entries.hasNext()) {
            Entry<String, SolrDocument> current = entries.next();
            String prefix = current.getKey();
            if (prefix != null) {
                SolrDocument cluster = current.getValue();
                log.info("Cluster: key: " + prefix + ", value: ");

                // Get all neighbors to check for.
                GeoHash[] neighbors = GeohashHelper.getAdjacecentNorthWest(GeoHash.fromGeohashString(prefix));
                for (int n = 0; n < neighbors.length; n++) {
                    String neighborKey = neighbors[n].toBase32();
                    if (!clusterMap.containsKey(neighborKey)) {
                        continue;
                    }
                    SolrDocument target = clusterMap.get(neighborKey);

                    // Merge into the first overlapping neighbor, then stop looking:
                    // the current cluster no longer exists.
                    if (shouldCluster(target, cluster)) {
                        mergeCluster(target, cluster);
                        entries.remove();
                        break;
                    }
                }
            }
        }
    }

    /**
     * Finishes every cluster that has a non-null key and collects them into a list.
     *
     * @param clusterMap clusters keyed by geohash prefix
     * @return the finished clusters, in the iteration order of the map
     */
    private SolrDocumentList finalizeClusters(Map<String, SolrDocument> clusterMap) {
        SolrDocumentList finished = new SolrDocumentList();
        Iterator<Entry<String, SolrDocument>> entries = clusterMap.entrySet().iterator();
        while (entries.hasNext()) {
            Entry<String, SolrDocument> entry = entries.next();
            String prefix = entry.getKey();
            // Entries stored under a null key are left out of the result.
            if (prefix != null) {
                SolrDocument cluster = entry.getValue();
                finishCluster(cluster, prefix);
                finished.add(cluster);
            }
        }
        return finished;
    }

    /**
     * Creates a cluster containing a single document and registers it in the map.
     *
     * @param doc first member document
     * @param docId id under which the document is stored in the cluster
     * @param clusterMap map the new cluster is put into
     * @param geohashPrefix key of the new cluster in the map
     * @param docList document list of the group; not used by this method
     * @return the newly created cluster
     */
    private SolrDocument initCluster(SolrDocument doc, Integer docId, Map<String, SolrDocument> clusterMap,
            String geohashPrefix, DocSlice docList) {
        SolrDocument created = new SolrDocument();

        // Members live in a map on the cluster until the cluster is finished.
        HashMap<Integer, SolrDocument> members = new HashMap<Integer, SolrDocument>();
        members.put(docId, doc);
        created.addField(GEOCLUSTER_DOCS, members);

        clusterMap.put(geohashPrefix, created);
        return created;
    }

    /**
     * Adds a document to the member map of a cluster.
     *
     * @param cluster cluster whose member map receives the document
     * @param doc document to add
     * @param docId key the document is stored under; an existing entry is replaced
     */
    private void addCluster(SolrDocument cluster, SolrDocument doc, Integer docId) {
        Object members = cluster.getFieldValue(GEOCLUSTER_DOCS);
        ((HashMap<Integer, SolrDocument>) members).put(docId, doc);
    }

    /**
     * Computes the cluster center as the midpoint of the bounding box of all
     * member coordinates and stores it under {@link #GEOCLUSTER_CENTER}.
     *
     * Coordinates are read from {@link #latlonField} as "lat,lon". Members with
     * a missing or malformed value are ignored (malformed ones are logged). If
     * no member has a usable coordinate, or the center is rejected by the LatLng
     * implementation, no center is set.
     *
     * @param cluster cluster to update; null is ignored
     */
    private void updateCluster(SolrDocument cluster) {
        if (cluster == null) {
            return;
        }
        HashMap<Integer, SolrDocument> docs = (HashMap<Integer, SolrDocument>) cluster
                .getFieldValue(GEOCLUSTER_DOCS);
        if (docs == null) {
            return;
        }

        boolean found = false;
        float latMin = 0f, latMax = 0f, lonMin = 0f, lonMax = 0f;
        for (SolrDocument doc : docs.values()) {
            // getFirstValue also copes with a multi-valued field, where a String cast would fail.
            Object raw = doc.getFirstValue(this.latlonField);
            if (raw == null) {
                continue;
            }
            String[] latlonSplit = raw.toString().split(",");
            if (latlonSplit.length < 2) {
                log.warn("Ignoring malformed latlon value: " + raw);
                continue;
            }
            float lat;
            float lon;
            try {
                lat = Float.parseFloat(latlonSplit[0]);
                lon = Float.parseFloat(latlonSplit[1]);
            } catch (NumberFormatException e) {
                log.warn("Ignoring malformed latlon value: " + raw);
                continue;
            }
            if (Float.isNaN(lat) || Float.isNaN(lon)) {
                // NaN would poison the min/max computation below.
                log.warn("Ignoring malformed latlon value: " + raw);
                continue;
            }
            if (found) {
                latMin = Math.min(latMin, lat);
                latMax = Math.max(latMax, lat);
                lonMin = Math.min(lonMin, lon);
                lonMax = Math.max(lonMax, lon);
            } else {
                latMin = latMax = lat;
                lonMin = lonMax = lon;
                found = true;
            }
        }

        if (!found) {
            log.warn("Cluster has no usable coordinates, no center set");
            return;
        }
        try {
            LatLng latlonCenter = new FloatLatLng((latMin + latMax) / 2, (lonMin + lonMax) / 2);
            cluster.put(GEOCLUSTER_CENTER, latlonCenter);
        } catch (RuntimeException e) {
            // Presumably thrown for out-of-range coordinates; keep the cluster without a center.
            log.warn("Could not create cluster center", e);
        }
    }

    /**
     * Decides whether two clusters are close enough on screen to be merged.
     *
     * The distance between the two centers is converted from kilometers to
     * meters and then, using {@link #resolution} (meters per pixel), to pixels,
     * so that it can be compared with {@link #clusterDistance} (pixels).
     *
     * @param cluster first cluster
     * @param otherCluster second cluster
     * @return true if the centers are less than clusterDistance pixels apart;
     *         false if either cluster has no center or the resolution is not positive
     */
    private boolean shouldCluster(SolrDocument cluster, SolrDocument otherCluster) {
        LatLng latlng = (LatLng) cluster.get(GEOCLUSTER_CENTER);
        LatLng latlngOther = (LatLng) otherCluster.get(GEOCLUSTER_CENTER);
        if (latlng == null || latlngOther == null) {
            // A cluster without usable coordinates cannot overlap anything.
            return false;
        }
        if (!(this.resolution > 0)) {
            // Also catches NaN; without a valid resolution there is no pixel distance.
            log.warn("Invalid resolution " + this.resolution + ", clusters are not merged");
            return false;
        }

        // Calculate distance in meters.
        double distanceMeters = latlng.arcDistance(latlngOther, DistanceUnits.KILOMETERS) * 1000d;
        // Convert to pixels so that it is comparable with the cluster distance.
        double distancePixels = distanceMeters / this.resolution;
        log.info("distance: " + distanceMeters + " m, " + distancePixels + " px");
        return distancePixels < this.clusterDistance;
    }

    /**
     * Merges otherCluster into cluster.
     *
     * All member documents of otherCluster are added to cluster, and the center
     * of cluster is replaced by the midpoint of the two previous centers.
     * otherCluster itself is not modified.
     *
     * @param cluster cluster that receives the documents and the new center
     * @param otherCluster cluster whose documents are taken over
     */
    private void mergeCluster(SolrDocument cluster, SolrDocument otherCluster) {
        Object absorbed = otherCluster.getFieldValue(GEOCLUSTER_DOCS);
        for (Entry<Integer, SolrDocument> member : ((HashMap<Integer, SolrDocument>) absorbed).entrySet()) {
            Integer memberId = member.getKey();
            SolrDocument memberDoc = member.getValue();
            addCluster(cluster, memberDoc, memberId);
        }

        // Update center.
        LatLng ownCenter = (LatLng) cluster.get(GEOCLUSTER_CENTER);
        LatLng absorbedCenter = (LatLng) otherCluster.get(GEOCLUSTER_CENTER);
        cluster.setField(GEOCLUSTER_CENTER, ownCenter.calculateMidpoint(absorbedCenter));
    }

    /**
     * Converts a working cluster into its response form.
     *
     * The center is replaced by a "lat,lng" string (left unset when the cluster
     * has no center). The member-document map is removed and replaced by the
     * list of its keys plus the member count, and the geohash prefix is stored
     * as "ss_id".
     *
     * @param cluster cluster to convert in place
     * @param geohashPrefix key of the cluster, stored as its id
     */
    private void finishCluster(SolrDocument cluster, String geohashPrefix) {
        // Replace center with latlng string.
        LatLng center = (LatLng) cluster.get(GEOCLUSTER_CENTER);
        if (center != null) {
            cluster.setField(GEOCLUSTER_CENTER, center.getLat() + "," + center.getLng());
        } else {
            // No member had usable coordinates; emit the cluster without a center instead of failing.
            log.warn("Cluster " + geohashPrefix + " has no center");
        }

        // Replace docs with ids only.
        HashMap<Integer, SolrDocument> docs = (HashMap<Integer, SolrDocument>) cluster
                .getFieldValue(GEOCLUSTER_DOCS);
        cluster.remove(GEOCLUSTER_DOCS);
        // Copy the keys: keySet() is a live view that would keep the whole document map reachable.
        ArrayList<Integer> ids = docs == null ? new ArrayList<Integer>() : new ArrayList<Integer>(docs.keySet());
        cluster.setField(GEOCLUSTER_DOC_IDS, ids);
        cluster.addField(GEOCLUSTER_COUNT, ids.size());
        cluster.addField("ss_id", geohashPrefix);
    }

    /**
     * Returns the set of field names to load for each result document.
     * Concrete classes can override this method if needed.
     * This implementation returns the group, geohash, id and latlon field names
     * configured for the current request.
     *
     * @param sreq the current request; not used by this implementation
     * @return set of field names to load
     */
    protected Set<String> getFieldsToLoad(SolrQueryRequest sreq) {
        // TODO: still complete documents seem to be loaded.
        String[] wanted = { this.groupField, this.geohashField, this.idField, this.latlonField };
        Set<String> names = new HashSet<String>();
        for (String name : wanted) {
            names.add(name);
        }
        return names;
    }

    /**
     * Loads the documents of a doc list through SolrPluginUtils, restricted to
     * the fields returned by {@link #getFieldsToLoad(SolrQueryRequest)}.
     *
     * @param docList documents to load
     * @param sreq request providing the searcher
     * @param docIds map handed to SolrPluginUtils.docListToSolrDocumentList; process()
     *        reads it afterwards as a document-to-id mapping
     * @return the loaded documents
     * @throws IOException if reading the documents fails
     */
    protected SolrDocumentList getSolrDocumentList(DocList docList, SolrQueryRequest sreq,
            Map<SolrDocument, Integer> docIds) throws IOException {
        SolrIndexSearcher searcher = sreq.getSearcher();
        Set<String> fieldNames = getFieldsToLoad(sreq);
        return SolrPluginUtils.docListToSolrDocumentList(docList, searcher, fieldNames, docIds);
    }

    /**
     * Removes the geocluster switch from a shard request when both the
     * geocluster parameter and ClusteringParams.USE_SEARCH_RESULTS are true on
     * the original request; otherwise the shard request is left unchanged.
     *
     * @param rb response builder of the original request
     * @param who component that created the shard request; not used here
     * @param sreq shard request to modify
     */
    @Override
    public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
        SolrParams requestParams = rb.req.getParams();
        boolean enabled = requestParams.getBool(COMPONENT_NAME, false);
        if (enabled && requestParams.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
            sreq.params.remove(COMPONENT_NAME);
        }
    }

    /**
     * Passes the arguments to the superclass and keeps a reference to them in
     * initParams.
     *
     * @param args initialization arguments for this component
     */
    @Override
    @SuppressWarnings("unchecked")
    public void init(NamedList args) {
        super.init(args);
        this.initParams = args;
    }

    /**
     * SolrCoreAware callback; this component does nothing with the core.
     *
     * @param core the Solr core; unused
     */
    public void inform(SolrCore core) {
    }

    // ///////////////////////////////////////////
    // / SolrInfoMBean
    // //////////////////////////////////////////

    /**
     * @return a fixed, human-readable description of this component
     */
    @Override
    public String getDescription() {
        return "A Clustering component";
    }

    /**
     * @return the literal "$Revision$" (presumably a version-control keyword
     *         placeholder that was never expanded -- confirm)
     */
    @Override
    public String getVersion() {
        return "$Revision$";
    }

    /**
     * @return the literal "$Id$" (presumably a version-control keyword
     *         placeholder that was never expanded -- confirm)
     */
    @Override
    public String getSourceId() {
        return "$Id$";
    }

    /**
     * @return the literal "$URL$" (presumably a version-control keyword
     *         placeholder that was never expanded -- confirm)
     */
    @Override
    public String getSource() {
        return "$URL$";
    }

}