com.ikanow.infinit.e.processing.generic.synchronization.SynchronizationManager.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.processing.generic.synchronization.SynchronizationManager.java
Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.generic.synchronization;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.action.search.SearchRequestBuilder;
import org.elasticsearch.index.get.GetField;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.sort.SortOrder;

import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;

public class SynchronizationManager {

    private static Logger logger = Logger.getLogger(SynchronizationManager.class);
    private static boolean bKillMeNow = false;

    public static void killMe() {
        bKillMeNow = true;
    }

    private List<SourcePojo> sources = null;

    public void setSources(List<SourcePojo> sources) {
        this.sources = sources;
    }

    /**
     * Does the DB sync, pulls all mongo docs that occured from the
     * cleanseStartTime and source and makes sure they are in the search db.
     * 
     * @param lastCleanse 1 hour before this harvester started
     * @param sources list of sources we are syncing
     * @return The number of errors fixed (docs deleted)
     */
    // DON'T USE THIS UNTIL REWRITTEN - IT SHOULD TRANSFER DOCS ACROSS, NOT LEAVE THEM ALONE
    @Deprecated
    public int syncDB(long cleanseStartTime, Set<String> dbCache) {
        dbCache.clear();

        int fixcount = 0;
        DBCollection contentDb = DbManager.getDocument().getContent();
        DBCollection documentDb = DbManager.getDocument().getMetadata();
        StoreAndIndexManager storeManager = new StoreAndIndexManager();

        for (SourcePojo sp : sources) {
            // Don't combine the sources (apart from unusual multi-community case), because
            // that prevents you from using the compound sourceKey/_id index

            List<String> sourceKeyList = new ArrayList<String>();
            sourceKeyList.addAll(sp.getDistributedKeys());

            try {
                List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
                //FIRST DO ALL NEW FEEDS
                BasicDBObject query = new BasicDBObject();
                query.put(DocumentPojo._id_,
                        new BasicDBObject(MongoDbManager.gt_, new ObjectId((int) (cleanseStartTime / 1000), 0, 0))); // time aspect
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); //source aspect
                BasicDBObject queryFields = new BasicDBObject();
                queryFields.append(DocumentPojo.url_, 1);
                queryFields.append(DocumentPojo.index_, 1);
                queryFields.append(DocumentPojo.sourceKey_, 1);

                DBCursor cur = documentDb.find(query, queryFields).batchSize(100);
                ElasticSearchManager esm = null;
                ElasticSearchManager esm_base = ElasticSearchManager.getIndex("document_index");
                String sIndex = null;

                while (cur.hasNext()) {
                    if (bKillMeNow) {
                        return fixcount;
                    }

                    DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
                    if (null != doc.getId()) {
                        dbCache.add(doc.getId().toString());
                    }

                    // Get index of doc to check in:
                    String sNewIndex = doc.getIndex();
                    if (null == sNewIndex) {
                        sIndex = null;
                        esm = esm_base;
                    } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                        sIndex = sNewIndex;
                        if (sNewIndex.equals("document_index")) {
                            esm = esm_base;
                        } else {
                            esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                        }
                    }

                    //Compare mongo doc to search doc
                    Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                    if (null == results || results.isEmpty()) {
                        //either too many entries (duplicates) or no entry
                        //delete this doc from both
                        logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                                + " not found in search (or duplicate)");
                        docs_to_remove.add(doc);
                        documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                        BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                        contentQ.put(CompressedFullTextPojo.sourceKey_,
                                new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                        contentDb.remove(contentQ);
                        fixcount++;
                    }
                } //end loop over new docs for this source
                storeManager.removeFromSearch(docs_to_remove);

                //NOW VERIFY ALL OLD FEEDS
                int iteration = 1;
                boolean removedAll = true;
                docs_to_remove.clear();
                while (removedAll) {
                    int rows = iteration * iteration * 10; //10x^2 exponentially check more docs
                    int oldfixes = 0;
                    BasicDBObject queryOLD = new BasicDBObject();
                    queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); //source aspect
                    BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);

                    DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
                    while (curOLD.hasNext()) {
                        DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);
                        if (null != doc.getId()) {
                            dbCache.add(doc.getId().toString());
                        }

                        // Get index of doc to check in:
                        String sNewIndex = doc.getIndex();
                        if (null == sNewIndex) {
                            sIndex = null;
                            esm = esm_base;
                        } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                            sIndex = sNewIndex;
                            if (sNewIndex.equals("document_index")) {
                                esm = esm_base;
                            } else {
                                esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                            }
                        }

                        //Compare mongo doc to search doc
                        Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                        if (null == results || results.isEmpty()) {
                            //either too many entries (duplicates) or no entry
                            //delete this doc from both
                            logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                                    + " not found in search (or duplicate)");
                            docs_to_remove.add(doc);
                            documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                            contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
                            fixcount++;
                            oldfixes++;
                        }
                    }
                    if (oldfixes != rows)
                        removedAll = false;
                } //(end loop over old docs for this source)
                storeManager.removeFromSearch(docs_to_remove);
            } catch (Exception e) {
                // If an exception occurs log the error
                logger.error("Exception Message: " + e.getMessage(), e);
            }
        }
        return fixcount;
    }//TESTED (unchanged from "tested" Beta version)

    /**
     * Does the DB sync, pulls all solr docs that occured from the
     * cleanseStartTime and source and makes sure they are in the mongo db.
     * 
     * @param lastCleanse 1 hour before this harvester started
     * @param sources list of sources we are syncing
     * @return The number of errors fixed (docs deleted)
     */
    //TODO INF-2239 ... lol fail if syncDB isn't called then dbCache is empty and everything gets deleted...
    public int syncSearch(long cleanseStartTime, Set<String> dbCache) {
        int fixcount = 0;
        StoreAndIndexManager storeManager = new StoreAndIndexManager();

        // NO LONGER NEEDED, HAVE CACHE (EXCEPT IN ONE PLACE, THE "OLD DOCS" CHECK)
        DBCollection documentDb = DbManager.getDocument().getMetadata();
        BasicDBObject queryFields = new BasicDBObject(); // (ie just _id, basically only need to know if it exists)
        try {
            //get solr entries from last cleanse point   
            int source_index = 0;
            int source_count = sources.size();
            for (SourcePojo sp : sources) {
                if (bKillMeNow) {
                    return fixcount;
                }
                List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();

                // Get all indexes this source might use:
                StringBuffer sb = new StringBuffer("document_index");
                for (ObjectId sCommunityId : sp.getCommunityIds()) {
                    sb.append(",doc_").append(sCommunityId.toString());
                }
                sb.append("/document_index");

                ElasticSearchManager esm = ElasticSearchManager.getIndex(sb.toString());

                SearchRequestBuilder searchOptions = esm.getSearchOptions();
                BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
                boolQuery.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
                boolQuery.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
                searchOptions.setSize(200); // (note this is multiplied by the number of primary shards)
                searchOptions.setSearchType(SearchType.SCAN);
                searchOptions.setScroll("10m");
                SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
                String scrollId = rsp.getScrollId();
                int nSkip = 0;

                for (;;) // Until no more hits
                {
                    rsp = esm.doScrollingQuery(scrollId, "10m");
                    SearchHit[] docs = rsp.getHits().getHits();
                    scrollId = rsp.getScrollId();

                    if ((null == docs) || (0 == docs.length)) {
                        break;
                    }
                    if (docs.length > 100) { // just display large checks)
                        logger.info("Checking ES docs for large source=" + sp.getKey() + " source: " + source_index
                                + "/" + source_count + " from " + nSkip + " to " + (nSkip + docs.length));
                    }

                    //Check all solr docs against mongodb

                    for (SearchHit hit : docs) {
                        String idStr = hit.getId();
                        boolean found = true; //(fail closed!)
                        if (null == dbCache) {
                            //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
                            ObjectId id = new ObjectId(idStr);
                            BasicDBObject query = new BasicDBObject(DocumentPojo._id_, id);
                            query.put(DocumentPojo.sourceKey_, sp.getDistributedKeyQueryTerm()); // (ensures uses only the right shard)
                            DBObject dbo = documentDb.findOne(query, queryFields);
                            found = (dbo != null);
                        } //TESTED
                        else {
                            found = dbCache.contains(idStr);
                        } //TESTED
                        if (!found) {
                            ObjectId id = new ObjectId(idStr);
                            DocumentPojo doc = new DocumentPojo();
                            doc.setId(id);
                            doc.setIndex(hit.getIndex() + "/document_index");
                            docs_to_remove.add(doc);
                            logger.info("db sync removing doc: " + id + "/" + hit.getIndex() + "/" + source_index
                                    + " not found in mongo");
                            fixcount++;
                        } // end if not found
                    } // end loop over docs to check

                    nSkip += docs.length;
                } // until no more hits
                if (!docs_to_remove.isEmpty()) {
                    storeManager.removeFromSearch(docs_to_remove);
                    docs_to_remove.clear();
                }

                //CHECK OLD FEEDS 10 at atime
                int iteration = 1;
                boolean removedAll = true;
                while (removedAll) {
                    int rows = iteration * iteration * 10;//exponential scaling 10x^2
                    iteration++;
                    int oldfixes = 0;

                    //get old docs from es
                    SearchRequestBuilder searchOptionsOLD = esm.getSearchOptions();
                    BoolQueryBuilder boolQueryOLD = QueryBuilders.boolQuery();
                    boolQueryOLD.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
                    boolQueryOLD.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
                    searchOptionsOLD.addSort(DocumentPojo.created_, SortOrder.ASC);
                    searchOptionsOLD.setSize(rows);
                    SearchResponse rspOLD = esm.doQuery(boolQueryOLD, searchOptionsOLD);
                    SearchHit[] docsOLD = rspOLD.getHits().getHits();

                    //Check all solr docs against mongodb

                    for (SearchHit hit : docsOLD) {
                        String idStr = hit.getId();
                        boolean found = true;
                        if (null == dbCache) {
                            //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
                            ObjectId id = new ObjectId(idStr);
                            BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                            DBObject dbo = documentDb.findOne(queryOLD, queryFields);
                            found = (dbo != null);
                        } //TESTED
                        else {
                            found = dbCache.contains(idStr);
                        } //TESTED
                        if (!found) {
                            // Also need to check the DB since dbCache is not guaranteed to be populated with the same
                            // number of "final" docs
                            ObjectId id = new ObjectId(idStr);
                            if (rows > 10) { // (dbCache always loaded with the first 10 rows)
                                BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                                if (null != documentDb.findOne(queryOLD, queryFields)) { // it is actually present
                                    continue;
                                }
                            }
                            DocumentPojo doc = new DocumentPojo();
                            doc.setId(id);
                            doc.setIndex(hit.getIndex() + "/document_index");
                            docs_to_remove.add(doc);
                            logger.info(
                                    "db sync removing doc: " + idStr + "/" + source_index + " not found in mongo");
                            oldfixes++;
                            fixcount++;
                        }
                    }
                    if (!docs_to_remove.isEmpty()) {
                        storeManager.removeFromSearch(docs_to_remove);
                    }

                    if (oldfixes != rows)
                        removedAll = false;
                }
                source_index++;
            } // end loop over sources
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
        return fixcount;
    }//TESTED (unchanged from "tested" Beta version)
}