com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java, a utility that synchronizes association features from MongoDB into an Elasticsearch index, or deletes them from both stores.

Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.utility;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.bson.BSONObject;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;

import com.google.gson.Gson;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.IndexManager;
import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
import com.ikanow.infinit.e.processing.generic.aggregation.AssociationAggregationUtils;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoException;

public class MongoAssociationFeatureTxfer {
    //___________________________________________________________________________________________________

    // MAIN

    /**
     * @param sConfigPath location of the Infinit.e configuration (null to use the default)
     * @param sQuery JSON query selecting the association features to process (null for all)
     * @param bDelete if true, delete the matching features instead of transferring them
     * @param bRebuildIndex if true, (re)initialize the Elasticsearch index first
     * @param nSkip number of records to skip before processing
     * @param nLimit maximum number of records to process (0 for no limit)
     * @param chunksDescription if non-null, process the collection in shard chunks
     * @throws MongoException 
     * @throws NumberFormatException 
     * @throws IOException 
     */
    public static void main(String sConfigPath, String sQuery, boolean bDelete, boolean bRebuildIndex, int nSkip,
            int nLimit, String chunksDescription) throws NumberFormatException, MongoException, IOException {

        MongoAssociationFeatureTxfer txferManager = new MongoAssociationFeatureTxfer();

        // Configuration and identity set-up
        com.ikanow.infinit.e.data_model.Globals
                .setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
        if (null != sConfigPath) {
            com.ikanow.infinit.e.data_model.Globals.overrideConfigLocation(sConfigPath);
        }
        if (bRebuildIndex) {
            new GenericProcessingController().InitializeIndex(false, false, true);
        }

        BasicDBObject query = null;
        if (null == sQuery) {
            query = new BasicDBObject();
        } else {
            query = (BasicDBObject) com.mongodb.util.JSON.parse(sQuery);
        }

        if (bDelete) {
            txferManager.doDelete(query, nLimit);
        } else {
            if (null == chunksDescription) {
                txferManager.doTransfer(query, nSkip, nLimit, null);
            } else {
                txferManager.doChunkedTransfer(query, nSkip, nLimit, chunksDescription);
            }
        }
    }

    //___________________________________________________________________________________________________

    // Wrapper for doing transfer in chunks:

    private void doChunkedTransfer(BasicDBObject query, int nSkip, int nLimit, String chunksDescription)
            throws IOException {
        List<BasicDBObject> chunkList = MongoIndexerUtils.getChunks("feature.association", chunksDescription);
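        // (each chunk carries the $min/$max shard-key bounds of one shard chunk,
        //  plus a "$id" string that is used below only for progress logging)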
        System.out.println("CHUNKS: Found " + chunkList.size() + " chunks");
        //DEBUG
        //System.out.println("Chunklist= " + chunkList);
        for (BasicDBObject chunk : chunkList) {
            BasicDBObject cleanQuery = new BasicDBObject();
            cleanQuery.putAll((BSONObject) query);
            String id = null;
            try {
                id = (String) chunk.remove("$id");
                System.out.println("CHUNK: " + id);
                doTransfer(cleanQuery, 0, 0, chunk);
            } catch (Exception e) {
                System.out.println("FAILED CHUNK: " + id + " ... " + e.getMessage());
            }
        }
    }//TESTED

    //___________________________________________________________________________________________________

    // PROCESSING LOOP (new interface)

    Map<String, SourcePojo> _sourceCache = new HashMap<String, SourcePojo>();

    private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
        ElasticSearchManager elasticManager = null;

        // Initialize the DB:
        DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

        // Initialize the ES (create the index if it doesn't already exist):

        // 1. Set up the association feature index

        ElasticSearchManager.setDefaultClusterName("infinite-aws");
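        // (the cluster name is hard-coded here; adjust for your own deployment)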

        // (delete the index)
        //elasticManager = ElasticSearchManager.getIndex("association_index");
        //elasticManager.deleteMe();

        // Create the index if necessary
        String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
                AssociationFeaturePojoIndexMap.Mapping.class);
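        // Local index settings: a single shard with no replicas, plus a
        // "suggestAnalyzer" (standard tokenizer + lowercase filter), presumably
        // referenced by suggest fields in the mapping above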
        Builder localSettings = ImmutableSettings.settingsBuilder();
        localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
        localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
        localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

        elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping,
                localSettings);

        // Get the index (createIndex returns null if the index already exists)
        if (null == elasticManager) {
            elasticManager = ElasticSearchManager.getIndex("association_index");
        }

        // Now query the DB:

        DBCursor dbc = null;
        dbc = eventFeatureDB.find(query);
        if (null != chunk) {
            if (chunk.containsField(DbManager.min_)) {
                dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
            }
            if (chunk.containsField(DbManager.max_)) {
                dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
            }
        }
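        // (the $min/$max cursor specials restrict the scan to this chunk's
        //  shard-key range, so each chunked pass covers a disjoint slice)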
        dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
        if (null == chunk) {
            int nCount = dbc.count() - nSkip;
            if (nCount < 0)
                nCount = 0;
            System.out.println(
                    "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
            if (0 == nCount) { // Nothing to do...
                return;
            }
        }

        List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();

        int nSynced = 0;

        // Loop over the cursor and sync each association feature
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

            // If this table has just been rebuilt from the document then the indexes are all wrong ...
            // recalculate and save
            if ('#' == evt.getIndex().charAt(0)) {
                AssociationPojo singleEvt = new AssociationPojo();
                singleEvt.setEntity1_index(evt.getEntity1_index());
                singleEvt.setEntity2_index(evt.getEntity2_index());
                singleEvt.setVerb_category(evt.getVerb_category());
                singleEvt.setGeo_index(evt.getGeo_index());
                evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt));
                eventFeatureDB
                        .update(new BasicDBObject("_id", dbo.get("_id")),
                                new BasicDBObject(MongoDbManager.set_,
                                        new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                                false, true);
                // (has to be a multi-update even though it's unique because it's sharded on index)
            }

            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            if (null == evt.getCommunityId()) {
                evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
            }
            // Bulk add prep
            events.add(evt);
            nSynced++;

            if (events.size() > 1000) {
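                // Flush a full batch to Elasticsearch to keep memory usage bounded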
                elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events,
                        AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null,
                        true);
                events.clear();
            }
        }
        // End loop over associations

        // Write whatever's left
        elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
                new AssociationFeaturePojoIndexMap()), "_id", null, true);

        if (null != chunk) {
            System.out.println("Found " + nSynced + " records to sync in chunk");
        }
    }
    //___________________________________________________________________________________________________

    // DELETE DOCUMENTS FROM A QUERY

    private void doDelete(BasicDBObject query, int nLimit) {
        try {
            // Initialize the DB:   
            DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

            DBCursor cur = eventFeatureDB.find(query).limit(nLimit);
            // (the cursor internally fetches in batches of 1000)
            System.out.println("Found " + cur.count() + " records to delete");
            if (nLimit > 0) {
                System.out.println("(limited to " + nLimit + " records)");
            }

            ArrayList<AssociationFeaturePojo> events = new ArrayList<AssociationFeaturePojo>();
            LinkedList<String> eventIds = new LinkedList<String>();
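            // Build the Elasticsearch ids ("index:communityId") while removing
            // each matching feature from MongoDB by its index key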
            while (cur.hasNext()) {
                AssociationFeaturePojo event = AssociationFeaturePojo.fromDb(cur.next(),
                        AssociationFeaturePojo.class);
                events.add(event);
                eventIds.add(
                        new StringBuilder(event.getIndex()).append(":").append(event.getCommunityId()).toString());
                eventFeatureDB.remove(new BasicDBObject("index", event.getIndex()));
            }
            ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("association_index");
            elasticManager.bulkDeleteDocuments(eventIds);

        } catch (NumberFormatException e) {
            e.printStackTrace();
        } catch (MongoException e) {
            e.printStackTrace();
        }
    }
}
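
Usage

A minimal sketch of how this utility might be invoked from another class. The configuration path and query below are hypothetical placeholders, not values taken from the project; substitute whatever matches your own Infinit.e deployment.

public class MongoAssociationFeatureTxferExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical config location and query -- substitute your own values:
        String sConfigPath = "/opt/infinite-home/config";
        String sQuery = "{\"verb_category\":\"travel\"}";

        // Transfer (not delete) the first 1000 matching association features,
        // rebuilding the Elasticsearch index first, in a single un-chunked pass:
        com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.main(sConfigPath, sQuery,
                false, // bDelete
                true,  // bRebuildIndex
                0,     // nSkip
                1000,  // nLimit
                null); // chunksDescription (null => no chunking)
    }
}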