Example usage for com.mongodb DBCollection find

List of usage examples for com.mongodb DBCollection find

Introduction

This page presents usage examples for com.mongodb DBCollection.find.

Prototype

public DBCursor find(@Nullable final DBObject query, @Nullable final DBObject projection)

Document

Select documents in a collection and return a cursor to the selected documents.
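
Below is a minimal, self-contained sketch of the call, assuming a local mongod and hypothetical database, collection, and field names (legacy driver API):

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class FindExample {
    public static void main(String[] args) {
        MongoClient mongo = new MongoClient("localhost", 27017); // hypothetical server address
        DB db = mongo.getDB("test"); // hypothetical database name
        DBCollection coll = db.getCollection("test_table"); // hypothetical collection name

        BasicDBObject query = new BasicDBObject("status", "active"); // filter (hypothetical field)
        BasicDBObject fields = new BasicDBObject("name", 1); // projection: include "name" (plus _id)

        DBCursor cursor = coll.find(query, fields);
        try {
            while (cursor.hasNext()) {
                System.out.println(cursor.next());
            }
        } finally {
            cursor.close(); // release the server-side cursor
        }
        mongo.close();
    }
}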

Usage

From source file:com.hangum.tadpole.mongodb.core.test.MongoTestNotEqualsStmt.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) throws Exception {
    ConAndAuthentication testMongoCls = new ConAndAuthentication();
    Mongo mongo = testMongoCls.connection(ConAndAuthentication.serverurl, ConAndAuthentication.port);
    DB db = mongo.getDB("test");

    DBCollection myColl = db.getCollection("test_table");

    //      BasicDBObject myAndQuery = new BasicDBObject();
    //      myAndQuery.append("rental_id", new BasicDBObject("$ne", 1));

    BasicDBObject basicFields = new BasicDBObject();
    BasicDBObject basicWhere = new BasicDBObject();
    BasicDBObject basicSort = new BasicDBObject();

    DBCursor myCursor = myColl.find(basicWhere, basicFields).sort(basicSort).limit(999); // query first, then projection
    while (myCursor.hasNext()) {
        System.out.println(myCursor.next());
    }

    mongo.close();
}
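
The commented-out lines in the example above hint at a "$ne" (not-equals) filter; a minimal sketch of that variant against the same collection:

BasicDBObject neQuery = new BasicDBObject("rental_id", new BasicDBObject("$ne", 1));
DBCursor neCursor = myColl.find(neQuery, new BasicDBObject()); // empty projection returns all fields
while (neCursor.hasNext()) {
    System.out.println(neCursor.next());
}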

From source file:com.ikanow.infinit.e.api.knowledge.QueryHandler.java

License:Open Source License

private DBCursor getDocIds(DBCollection docDb, ObjectId[] ids, int nFromServerLimit,
        AdvancedQueryPojo.QueryOutputPojo output, AdvancedQueryPojo.QueryScorePojo score) {
    DBCursor docdCursor = null;
    try {

        BasicDBObject query = new BasicDBObject();
        query.put("_id", new BasicDBObject("$in", ids));
        BasicDBObject fields = new BasicDBObject(DocumentPojo.fullText_, 0); // (used to discard community ids -plus legacy versions-, now need it)
        if (!output.docs.metadata) {
            fields.put(DocumentPojo.metadata_, 0);
        }
        boolean bNotAggEnts = ((output.aggregation == null) || (output.aggregation.entsNumReturn == null)
                || (output.aggregation.entsNumReturn == 0));
        if (bNotAggEnts && (null != score) && (null != score.sigWeight) && (score.sigWeight > 0.0)) {
            bNotAggEnts = false; // (special case, use agg entities to score docs)
        }
        if (!output.docs.ents && bNotAggEnts) {
            fields.put(DocumentPojo.entities_, 0);
        }
        boolean bNotAggEvents = ((output.aggregation == null) || (output.aggregation.eventsNumReturn == null)
                || (output.aggregation.eventsNumReturn == 0));
        boolean bNotAggFacts = ((output.aggregation == null) || (output.aggregation.factsNumReturn == null)
                || (output.aggregation.factsNumReturn == 0));
        boolean bNoStandaloneEvents = (null == output.docs.eventsTimeline)
                || (null == output.docs.numEventsTimelineReturn) || (output.docs.numEventsTimelineReturn == 0);
        if (!output.docs.events && !output.docs.facts && !output.docs.summaries && bNoStandaloneEvents
                && bNotAggEvents && bNotAggFacts) {
            fields.put(DocumentPojo.associations_, 0);
        }
        //TESTED

        //cm = new CollectionManager();
        boolean bPrimary = true;

        if (_replicaSetDistributionRatio > 0) {
            if (0 != (new Date().getTime() % _replicaSetDistributionRatio)) {
                bPrimary = false;
            }
        }
        if (bPrimary) { // Get from the primary
            docdCursor = docDb.find(query, fields).batchSize(nFromServerLimit);
        } else { // Try and get from the secondary if possible
            docdCursor = docDb.find(query, fields).batchSize(nFromServerLimit)
                    .setReadPreference(ReadPreference.secondaryPreferred());
        }

    } catch (Exception e) {
        // If an exception occurs log the error
        _logger.error("Address Exception Message: " + e.getMessage(), e);
    }
    return docdCursor;
}
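
getDocIds builds an exclusion projection (each field set to 0 is dropped from the result; everything else is still returned) and optionally routes the read to a secondary. A condensed sketch of the same pattern, assuming ids, docDb, and a readFromSecondary flag are in scope and using hypothetical field names:

BasicDBObject query = new BasicDBObject("_id", new BasicDBObject("$in", ids));
BasicDBObject fields = new BasicDBObject("fullText", 0); // exclude heavyweight fields
fields.put("metadata", 0);

DBCursor cursor = readFromSecondary
        ? docDb.find(query, fields).batchSize(1000).setReadPreference(ReadPreference.secondaryPreferred())
        : docDb.find(query, fields).batchSize(1000);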

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

/**
 * Tests to see if a duplicate might exist for this source URL.
 * If it is not a duplicate, true is returned. If it is a duplicate,
 * the modified date is then checked to see if the file has been updated;
 * true is returned if the file has been updated, false otherwise.
 *
 * @param modifiedDate
 * @param sourceUrl
 * @param source
 * @return boolean (true/false)
 */
public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                    if (null != source.getDistributionFactor()) { // This is a slightly more complex case because other...
                        //...threads for this source could be writing documents asynchronously ... so we're just going to disable everything
                        //if the most recent doc is _after_ our last harvest time (since this means we've already started harvesting the new source)
                        Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo
                                .get(DocumentPojo.created_);
                        if ((null != source.getHarvestStatus())
                                && (null != source.getHarvestStatus().getHarvested()
                                        && (null != mostRecentlyModifedFile_createdTime))) {
                            if (mostRecentlyModifedFile_createdTime
                                    .after(source.getHarvestStatus().getHarvested())) {
                                _mostRecentlyModifiedFile = null;
                                _mostRecentlyModifiedDocId = null;
                            }
                        } else { // If we don't have a date then force a "slow" dedup
                            _mostRecentlyModifiedFile = null;
                            _mostRecentlyModifiedDocId = null;
                        }
                    } //TESTED
                } //(found docs)
            } //(success mode)
        } catch (Exception e) {
        } // If anything goes wrong we'll just check all files (slower)
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    else if (null == sourceUrl) {
        return true; // (for custom checking - if we couldn't get a cached value to compare against then assume we are inspecting)
    }

    // No short cut, go the long way round:      

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.sourceUrl_, sourceUrl);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2);
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).hint(hint).limit(1);
    // (this should be very fast since sourceUrl is indexed ... order doesn't matter as all docs should have the same modified)
    //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky

    if (!dbc.hasNext()) { //if there is no record, return true
        ret = true;
        modifiedDate.setTime(0);
    } else { // (all docs should have same modified, though this is ~ time ordered anyway)

        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)); // ie if different -> true -> update docs from sourceUrl
        // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where)
        // (this is just backwards compatible for a deployment where this has happened for some % -probably 100- of the docs
        //  once an RPM >=5955 is deployed this will no longer be necessary)
    }
    return ret;
}
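
The hint() call above forces the query planner onto a specific index instead of letting it choose; the hint document must match the index specification exactly (the value 2 here presumably mirrors how that sparse sourceUrl index was created). A minimal sketch, assuming a conventional { "sourceUrl": 1 } index exists:

BasicDBObject query = new BasicDBObject("sourceUrl", sourceUrl);
BasicDBObject fields = new BasicDBObject("modified", 1);
DBCursor dbc = collection.find(query, fields)
        .hint(new BasicDBObject("sourceUrl", 1)) // force the { "sourceUrl": 1 } index
        .limit(1);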

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_,
                                new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED

                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                } //TESTED (found docs)

                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //   System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));

            } //(success mode)            
        } catch (Exception e) {
        } // If anything goes wrong we'll just check all files (slower)

    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...         
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED

    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No short cut, go the long way round:

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();

    if (!foundMatch) { //if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)               
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)         
        } else { // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}
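
One subtlety in the multiple-match branch above: DBCursor.count() reports how many documents match the query and ignores limit()/skip(), while DBCursor.size() respects them. A quick illustration:

int matching = collection.find(query).count(); // total matches, ignores any limit()
int capped = collection.find(query).limit(1).size(); // respects limit(): 0 or 1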

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

private LinkedList<String> getCandidateDuplicates(BasicDBObject query, String parentSourceKey,
        boolean bUpdate) {
    _modifiedTimeOfActualDuplicate = null;
    _duplicateId = null;
    LinkedList<String> returnVal = new LinkedList<String>();

    DBCollection collection = DbManager.getDocument().getMetadata();
    BasicDBObject fields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
    if (bUpdate) {
        fields.put(DocumentPojo.modified_, 1);
        fields.put(DocumentPojo.updateId_, 1);
    } //TESTED

    boolean bPrimary = true;
    if (_replicaSetDistributionRatio > 0) {
        // (distribute based on source key, should ensure some reasonable cache grouping...)
        if (0 != (parentSourceKey.hashCode() % _replicaSetDistributionRatio)) {
            bPrimary = false;
        }
    }
    DBCursor dbc = null;
    if (bPrimary) {
        dbc = collection.find(query, fields);
    } else {
        dbc = collection.find(query, fields).setReadPreference(ReadPreference.secondaryPreferred());
    }
    while (dbc.hasNext()) {
        DBObject dbo = dbc.next();
        String sourceKey = DocumentPojo.getSourceKey((String) dbo.get(DocumentPojo.sourceKey_));
        if (null != sourceKey) {

            // Check for exact duplicates, in which case can bypass horrible functional duplicate logic:
            boolean bFoundExactDuplicate = sourceKey.equals(parentSourceKey);
            // Update logic:
            if (bUpdate && bFoundExactDuplicate) {
                _modifiedTimeOfActualDuplicate = (Date) dbo.get(DocumentPojo.modified_);
                _duplicateId = (ObjectId) dbo.get(DocumentPojo.updateId_);
                if (null == _duplicateId) { // first time, use the _id
                    _duplicateId = (ObjectId) dbo.get(DocumentPojo._id_);
                }
            } //TESTED

            if (bFoundExactDuplicate) { // Found exact duplicate, so return just that for performance
                returnVal.clear();
            }
            returnVal.add(sourceKey);

            if (bFoundExactDuplicate) { // Found exact duplicate, we're done here
                return returnVal;
            }
        } //(if doc has source key, else is malformed, ignore)         
    } //(end loop over URL-duplicates)
    return returnVal;
}

From source file:com.ikanow.infinit.e.processing.custom.utils.CustomApiUtils.java

License:Apache License

public static void getJobResults(ResponsePojo rp, CustomMapReduceJobPojo cmr, int limit, String fields,
        String findStr, String sortStr, boolean bCsv) {

    BasicDBObject queryDbo = null;
    if (null != findStr) {
        queryDbo = (BasicDBObject) com.mongodb.util.JSON.parse(findStr);
    } else {
        queryDbo = new BasicDBObject();
    } //TOTEST

    BasicDBObject fieldsDbo = new BasicDBObject();
    if (null != fields) {
        fieldsDbo = (BasicDBObject) com.mongodb.util.JSON.parse("{" + fields + "}");
    }

    //return the results:

    // Need to handle sorting...
    BasicDBObject sort = null;
    if (null != sortStr) { //override
        sort = (BasicDBObject) com.mongodb.util.JSON.parse(sortStr);
    } else { //defaults
        String sortField = "_id";
        int sortDir = 1;
        BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse(
                InfiniteHadoopUtils.getQueryOrProcessing(cmr.query, InfiniteHadoopUtils.QuerySpec.POSTPROC));
        if (postProcObject != null) {
            sortField = postProcObject.getString("sortField", "_id");
            sortDir = postProcObject.getInt("sortDirection", 1);
        } //TESTED (post proc and no post proc)
        sort = new BasicDBObject(sortField, sortDir);
    } //TOTEST

    // Case 1: DB
    rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", true,
            "Map reduce job completed at: " + cmr.lastCompletionTime));
    if ((null == cmr.exportToHdfs) || !cmr.exportToHdfs) {
        DBCursor resultCursor = null;
        DBCollection coll = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory();
        CsvGeneratingBsonDecoder csvDecoder = null;
        SizeReportingBasicBSONDecoder sizeDecoder = null;
        CustomMapReduceResultPojo cmrr = new CustomMapReduceResultPojo();
        try {
            if (bCsv) {
                coll.setDBDecoderFactory((csvDecoder = new CsvGeneratingBsonDecoder()));
            } else {
                coll.setDBDecoderFactory((sizeDecoder = new SizeReportingBasicBSONDecoder()));
            }
            if (limit > 0) {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort).limit(limit);
            } else {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort);
            }
            LinkedList<BasicDBObject> list = null;
            if (!bCsv) {
                list = new LinkedList<BasicDBObject>();
            }
            final int MAX_SIZE_CSV = 80 * 1024 * 1024; //(80MB)
            final int MAX_SIZE_JSON = 80 * 1024 * 1024; //(80MB)
            while (resultCursor.hasNext()) {
                BasicDBObject x = (BasicDBObject) resultCursor.next();
                if (!bCsv) {
                    list.add(x);
                }
                if (null != csvDecoder) {
                    if (csvDecoder.getCsv().length() > MAX_SIZE_CSV) {
                        break;
                    }
                } else if (null != sizeDecoder) {
                    if (sizeDecoder.getSize() > MAX_SIZE_JSON) {
                        break;
                    }
                }
            }
            cmrr.results = list;
        } finally {
            coll.setDBDecoderFactory(defaultDecoder);
        }
        cmrr.lastCompletionTime = cmr.lastCompletionTime;
        if (null != csvDecoder) {
            StringBuffer header = new StringBuffer();
            for (String field : csvDecoder.getOrderedFields()) {
                if (0 != header.length()) {
                    header.append(',');
                }
                header.append('"');
                header.append(field.replace("\"", "\\\""));
                header.append("\"");
            }
            header.append('\n');
            header.append(csvDecoder.getCsv().toString());
            cmrr.results = header.toString();
        }
        rp.setData(cmrr);
    } //TESTED
    else { // Case 2: HDFS

        if ((null != cmr.outputKey) && (null != cmr.outputValue)
                && cmr.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                && cmr.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
            // special case, text file
            try {
                rp.setData(HadoopUtils.getBsonFromTextFiles(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in text file format, did you run the job before changing the output to Text/Text?"));
            }
        } //TESTED
        else { // sequence file
            try {
                rp.setData(HadoopUtils.getBsonFromSequenceFile(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in sequence file format, did you run the job with Text/Text?"));
            }
        } //TESTED
    } //TESTED      
}
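
getJobResults builds its find() arguments by parsing caller-supplied JSON strings with the legacy com.mongodb.util.JSON helper. A minimal sketch of that approach, with hypothetical JSON strings and a coll collection in scope:

BasicDBObject query = (BasicDBObject) com.mongodb.util.JSON.parse("{ \"value\": { \"$gt\": 10 } }");
BasicDBObject fields = (BasicDBObject) com.mongodb.util.JSON.parse("{ \"value\": 1 }");
BasicDBObject sort = (BasicDBObject) com.mongodb.util.JSON.parse("{ \"_id\": -1 }");

DBCursor cursor = coll.find(query, fields).sort(sort).limit(100);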

From source file:com.ikanow.infinit.e.processing.generic.synchronization.SynchronizationManager.java

License:Open Source License

/**
 * Does the DB sync: pulls all mongo docs created since the cleanseStartTime
 * for the given sources and makes sure they are in the search db.
 *
 * @param cleanseStartTime 1 hour before this harvester started
 * @param dbCache filled with the ids of the docs seen in the db
 * @return The number of errors fixed (docs deleted)
 */
// DON'T USE THIS UNTIL REWRITTEN - IT SHOULD TRANSFER DOCS ACROSS, NOT LEAVE THEM ALONE
@Deprecated
public int syncDB(long cleanseStartTime, Set<String> dbCache) {
    dbCache.clear();

    int fixcount = 0;
    DBCollection contentDb = DbManager.getDocument().getContent();
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    StoreAndIndexManager storeManager = new StoreAndIndexManager();

    for (SourcePojo sp : sources) {
        // Don't combine the sources (apart from unusual multi-community case), because
        // that prevents you from using the compound sourceKey/_id index

        List<String> sourceKeyList = new ArrayList<String>();
        sourceKeyList.addAll(sp.getDistributedKeys());

        try {
            List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
            //FIRST DO ALL NEW FEEDS
            BasicDBObject query = new BasicDBObject();
            query.put(DocumentPojo._id_,
                    new BasicDBObject(MongoDbManager.gt_, new ObjectId((int) (cleanseStartTime / 1000), 0, 0))); // time aspect
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); //source aspect
            BasicDBObject queryFields = new BasicDBObject();
            queryFields.append(DocumentPojo.url_, 1);
            queryFields.append(DocumentPojo.index_, 1);
            queryFields.append(DocumentPojo.sourceKey_, 1);

            DBCursor cur = documentDb.find(query, queryFields).batchSize(100);
            ElasticSearchManager esm = null;
            ElasticSearchManager esm_base = ElasticSearchManager.getIndex("document_index");
            String sIndex = null;

            while (cur.hasNext()) {
                if (bKillMeNow) {
                    return fixcount;
                }

                DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
                if (null != doc.getId()) {
                    dbCache.add(doc.getId().toString());
                }

                // Get index of doc to check in:
                String sNewIndex = doc.getIndex();
                if (null == sNewIndex) {
                    sIndex = null;
                    esm = esm_base;
                } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                    sIndex = sNewIndex;
                    if (sNewIndex.equals("document_index")) {
                        esm = esm_base;
                    } else {
                        esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                    }
                }

                //Compare mongo doc to search doc
                Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                if (null == results || results.isEmpty()) {
                    //either too many entries (duplicates) or no entry
                    //delete this doc from both
                    logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                            + " not found in search (or duplicate)");
                    docs_to_remove.add(doc);
                    documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                    BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                    contentQ.put(CompressedFullTextPojo.sourceKey_,
                            new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                    contentDb.remove(contentQ);
                    fixcount++;
                }
            } //end loop over new docs for this source
            storeManager.removeFromSearch(docs_to_remove);

            //NOW VERIFY ALL OLD FEEDS
            int iteration = 1;
            boolean removedAll = true;
            docs_to_remove.clear();
            while (removedAll) {
                int rows = iteration * iteration * 10; //10x^2 exponentially check more docs
                int oldfixes = 0;
                BasicDBObject queryOLD = new BasicDBObject();
                queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); //source aspect
                BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);

                DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
                while (curOLD.hasNext()) {
                    DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);
                    if (null != doc.getId()) {
                        dbCache.add(doc.getId().toString());
                    }

                    // Get index of doc to check in:
                    String sNewIndex = doc.getIndex();
                    if (null == sNewIndex) {
                        sIndex = null;
                        esm = esm_base;
                    } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                        sIndex = sNewIndex;
                        if (sNewIndex.equals("document_index")) {
                            esm = esm_base;
                        } else {
                            esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                        }
                    }

                    //Compare mongo doc to search doc
                    Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                    if (null == results || results.isEmpty()) {
                        //either too many entries (duplicates) or no entry
                        //delete this doc from both
                        logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                                + " not found in search (or duplicate)");
                        docs_to_remove.add(doc);
                        documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                        contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
                        fixcount++;
                        oldfixes++;
                    }
                }
                if (oldfixes != rows)
                    removedAll = false;
                iteration++; // grow the re-check window exponentially (10x^2)
            } //(end loop over old docs for this source)
            storeManager.removeFromSearch(docs_to_remove);
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
    }
    return fixcount;
}

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:

    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();

    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)

    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString().contains("\"" + DocumentPojo.sourceKey_ + "\"")) {
            throw new RuntimeException(
                    "Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
            query.put(DocumentPojo.sourceKey_,
                    SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                sourceKeys.addAll(sourceKeysForSource);
            }
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)         
    } else {
        if (query.toString().contains("\"" + DocumentPojo.sourceKey_ + "\"")) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key,
                            distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                    added += sourceKeysForSource.size();
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();

                System.out.println("(Can't optimize complex community query: " + e.getMessage());
            }
        } //TESTED (by hand - including distributed source version)
    }
    // Ignored delete objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
      //DEBUG
      //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {
        EntityBackgroundAggregationManager.startThread();
        AssociationBackgroundAggregationManager.startThread();
    }

    //Debug:
    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    byte[] storageArray = new byte[200000];

    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        }
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            _deletedIndex.add(sDocIndex);
            rebuildIndex(sDocIndex);
            try { // (Just in case the index requires some time to sort itself out)
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
        }

        //Debug:
        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);

            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    if (dbc.hasNext()) {
                        continue;
                    } //TESTED (by hand)               
                }

                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                doc.setFullText(output.toString());
            }
        }
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                    .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);

                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                        //DEBUG
                        e.printStackTrace();
                    }
                } //TESTED (by hand)

                _sourceCache.put(doc.getSourceKey(), src);
            }
        }
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }

                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                doc.getRawSourceKey()); // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                        docsDB.update(updateQuery,
                                new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_,
                                        new BasicDBObject(DbManager.each_, tagsTidied))));
                    }
                    doc.setTags(tagsTidied); // (just copy ptr across)
                }
            }
        }

        // 2. Update the index with the new document            

        // (Optionally also update entity and assoc features)

        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                perCommunityDocList.add(doc);
            }
        } //TESTED

        nSynced++;
        docsToTransfer.add(doc);
        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();

            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs               
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;

            } //TOTEST            

            manager.addToSearch(docsToTransfer);
            docsToTransfer.clear();
            System.out.println("(Synced " + nSynced + " records)");
        }

    } // (End loop over docs)

    // Sync remaining docs

    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs               
            doAggregation(communityList, docsToTransfer);
        }

        StoreAndIndexManager manager = new StoreAndIndexManager();
        manager.addToSearch(docsToTransfer);
    }

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }

    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
        System.out.println(
                "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
            Thread.sleep(300000);
        } catch (InterruptedException e) {
        }

        // Turn off so we can exit
        EntityBackgroundAggregationManager.stopThreadAndWait();
        AssociationBackgroundAggregationManager.stopThreadAndWait();
    }
}
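
doTransfer narrows its cursor to a shard-chunk range via addSpecial($min/$max), which bounds the index scan rather than filtering documents. A condensed sketch, assuming minBound and maxBound are index-shaped bound documents (hypothetical variables):

DBCursor dbc = docsDB.find(query)
        .addSpecial("$min", minBound) // inclusive lower index bound
        .addSpecial("$max", maxBound) // exclusive upper index bound
        .skip(nSkip).limit(nLimit).batchSize(1000);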

From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java

License:Apache License

static void doDelete(BasicDBObject query, int nLimit, boolean automatedRequest) {
    try {
        // Initialize the DB:   
        DBCollection entityFeatureDB = DbManager.getFeature().getEntity();
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("entity_index");

        BasicDBObject fields = new BasicDBObject();
        fields.put(EntityFeaturePojo.index_, 1);
        fields.put(EntityFeaturePojo.communityId_, 1);

        DBCursor cur = entityFeatureDB.find(query, fields).limit(nLimit);
        // (this internally works in batches of 1000)
        if (automatedRequest) {
            System.out.println("Found " + cur.count() + " records to delete from _id list");
        } else {
            System.out.println("Found " + cur.count() + " records to delete from " + query.toString());
        }
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }
        int nArraySize = (cur.count() > 1000) ? 1000 : cur.count();
        ArrayList<EntityFeaturePojo> batchList = new ArrayList<EntityFeaturePojo>(nArraySize);

        while (cur.hasNext()) {
            EntityFeaturePojo gp = EntityFeaturePojo.fromDb(cur.next(), EntityFeaturePojo.class);
            batchList.add(gp);
            if (batchList.size() >= nArraySize) {
                internalDelete(batchList, elasticManager);
                batchList.clear();
            }
        }
        if (!batchList.isEmpty()) {
            internalDelete(batchList, elasticManager);
        }
        entityFeatureDB.remove(query);

    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (MongoException e) {
        e.printStackTrace();
    }

}

From source file:com.impetus.client.mongodb.DefaultMongoDBDataHandler.java

License:Apache License

/**
 * Retrieves a collection of embedded objects within a document that match the
 * criteria specified in <code>query</code>. TODO: This code requires a
 * serious overhaul. Currently it assumes that the user query is in the form
 * "Select alias.columnName from EntityName alias". However, the correct query
 * to be supported is
 * "Select alias.superColumnName.columnName from EntityName alias"
 * 
 * @param dbCollection
 *            the db collection
 * @param m
 *            the m
 * @param documentName
 *            the document name
 * @param mongoQuery
 *            the mongo query
 * @param result
 *            the result
 * @param orderBy
 *            the order by
 * @param maxResult
 * @return the embedded object list
 * @throws PropertyAccessException
 *             the property access exception
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
public List getEmbeddedObjectList(DBCollection dbCollection, EntityMetadata m, String documentName,
        BasicDBObject mongoQuery, String result, BasicDBObject orderBy, int maxResult, int firstResult,
        BasicDBObject keys, final KunderaMetadata kunderaMetadata)
        throws PropertyAccessException, InstantiationException, IllegalAccessException {
    List list = new ArrayList(); // List of embedded objects to be returned

    // Specified after entity alias in query
    String columnName = result;

    // Something user didn't specify and we have to derive
    // TODO: User must specify this in query and remove this logic once
    // query format is changed

    String enclosingDocumentName = null;

    MetamodelImpl metaModel = (MetamodelImpl) kunderaMetadata.getApplicationMetadata()
            .getMetamodel(m.getPersistenceUnit());
    EntityType entityType = metaModel.entity(m.getEntityClazz());
    EmbeddableType superColumn = null;
    Set<Attribute> columns = null;
    Attribute attrib = null;
    try {
        attrib = entityType.getAttribute(columnName);
        Map<String, EmbeddableType> embeddables = metaModel.getEmbeddables(m.getEntityClazz());
        for (String key : embeddables.keySet()) {
            superColumn = embeddables.get(key);
            columns = superColumn.getAttributes();

            for (Attribute column : columns) {
                if (((AbstractAttribute) column).getJPAColumnName().equals(columnName)) {
                    enclosingDocumentName = key;
                    break;
                }
            }
        }
    } catch (IllegalArgumentException iax) {
        if (log.isWarnEnabled()) {
            log.warn("No column found for: " + columnName);
        }
    }

    // Query for fetching entities based on user-specified criteria,
    // applying the optional sort plus pagination
    DBCursor cursor = orderBy != null
            ? dbCollection.find(mongoQuery, keys).sort(orderBy).limit(maxResult).skip(firstResult)
            : dbCollection.find(mongoQuery, keys).limit(maxResult).skip(firstResult);

    if (superColumn != null) {
        Field superColumnField = (Field) attrib.getJavaMember();
        while (cursor.hasNext()) {
            DBObject fetchedDocument = cursor.next();
            Object embeddedDocumentObject = fetchedDocument.get(superColumnField.getName());

            if (embeddedDocumentObject != null) {
                if (embeddedDocumentObject instanceof BasicDBList) {
                    Class embeddedObjectClass = PropertyAccessorHelper.getGenericClass(superColumnField);
                    for (Object dbObj : (BasicDBList) embeddedDocumentObject) {
                        Object obj = embeddedObjectClass.newInstance();
                        Object embeddedObject = new DocumentObjectMapper().getObjectFromDocument(metaModel,
                                (BasicDBObject) dbObj, superColumn.getAttributes(), obj);
                        list.add(embeddedObject); // collect each embedded element of the list
                    }
                } else if (embeddedDocumentObject instanceof BasicDBObject) {
                    Object obj = superColumn.getJavaType().newInstance();
                    Object embeddedObject = DocumentObjectMapper.getObjectFromDocument(metaModel,
                            (BasicDBObject) embeddedDocumentObject, superColumn.getAttributes(), obj);
                    list.add(embeddedObject);
                } else {
                    throw new PersistenceException("Can't retrieve embedded object from MongoDB document because "
                            + "it wasn't stored as a BasicDBObject; possible problem in format.");
                }
            }
        }
    }
    return list;
}
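
When both ordering and pagination are needed, the cursor modifiers chain: sort() should be combined with skip()/limit() rather than used instead of them, as in this sketch:

DBCursor page = dbCollection.find(mongoQuery, keys)
        .sort(new BasicDBObject("_id", 1))
        .skip(firstResult)
        .limit(maxResult);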