Example usage for com.mongodb DBCollection find

List of usage examples for com.mongodb DBCollection find

Introduction

This page presents usage examples for com.mongodb DBCollection.find.

Prototype

public DBCursor find(@Nullable final DBObject query, @Nullable final DBObject projection)

Document

Select documents in a collection and return a cursor to the selected documents.
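
Below is a minimal, self-contained sketch of the call, assuming a local mongod and hypothetical database, collection, and field names (legacy driver API):

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class FindExample {
    public static void main(String[] args) {
        MongoClient mongo = new MongoClient("localhost", 27017); // hypothetical server address
        DB db = mongo.getDB("test"); // hypothetical database name
        DBCollection coll = db.getCollection("test_table"); // hypothetical collection name

        BasicDBObject query = new BasicDBObject("status", "active"); // filter (hypothetical field)
        BasicDBObject fields = new BasicDBObject("name", 1); // projection: include "name" (plus _id)

        DBCursor cursor = coll.find(query, fields);
        try {
            while (cursor.hasNext()) {
                System.out.println(cursor.next());
            }
        } finally {
            cursor.close(); // release the server-side cursor
        }
        mongo.close();
    }
}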

Usage

From source file:com.hangum.tadpole.mongodb.core.test.MongoTestNotEqualsStmt.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) throws Exception {
    ConAndAuthentication testMongoCls = new ConAndAuthentication();
    Mongo mongo = testMongoCls.connection(ConAndAuthentication.serverurl, ConAndAuthentication.port);
    DB db = mongo.getDB("test");

    DBCollection myColl = db.getCollection("test_table");

    //      BasicDBObject myAndQuery = new BasicDBObject();
    //      myAndQuery.append("rental_id", new BasicDBObject("$ne", 1));

    BasicDBObject basicFields = new BasicDBObject();
    BasicDBObject basicWhere = new BasicDBObject();
    BasicDBObject basicSort = new BasicDBObject();

    DBCursor myCursor = myColl.find(basicWhere, basicFields).sort(basicSort).limit(999); // query first, then projection
    while (myCursor.hasNext()) {
        System.out.println(myCursor.next());
    }

    mongo.close();
}
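
The commented-out lines in the example above hint at a "$ne" (not-equals) filter; a minimal sketch of that variant against the same collection:

BasicDBObject neQuery = new BasicDBObject("rental_id", new BasicDBObject("$ne", 1));
DBCursor neCursor = myColl.find(neQuery, new BasicDBObject()); // empty projection returns all fields
while (neCursor.hasNext()) {
    System.out.println(neCursor.next());
}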

From source file:com.ikanow.infinit.e.api.knowledge.QueryHandler.java

License:Open Source License

private DBCursor getDocIds(DBCollection docDb, ObjectId[] ids, int nFromServerLimit,
        AdvancedQueryPojo.QueryOutputPojo output, AdvancedQueryPojo.QueryScorePojo score) {
    DBCursor docdCursor = null;
    try {

        BasicDBObject query = new BasicDBObject();
        query.put("_id", new BasicDBObject("$in", ids));
        BasicDBObject fields = new BasicDBObject(DocumentPojo.fullText_, 0); // (used to discard community ids -plus legacy versions-, now need it)
        if (!output.docs.metadata) {
            fields.put(DocumentPojo.metadata_, 0);
        }
        boolean bNotAggEnts = ((output.aggregation == null) || (output.aggregation.entsNumReturn == null)
                || (output.aggregation.entsNumReturn == 0));
        if (bNotAggEnts && (null != score) && (null != score.sigWeight) && (score.sigWeight > 0.0)) {
            bNotAggEnts = false; // (special case, use agg entities to score docs)
        }
        if (!output.docs.ents && bNotAggEnts) {
            fields.put(DocumentPojo.entities_, 0);
        }
        boolean bNotAggEvents = ((output.aggregation == null) || (output.aggregation.eventsNumReturn == null)
                || (output.aggregation.eventsNumReturn == 0));
        boolean bNotAggFacts = ((output.aggregation == null) || (output.aggregation.factsNumReturn == null)
                || (output.aggregation.factsNumReturn == 0));
        boolean bNoStandaloneEvents = (null == output.docs.eventsTimeline)
                || (null == output.docs.numEventsTimelineReturn) || (output.docs.numEventsTimelineReturn == 0);
        if (!output.docs.events && !output.docs.facts && !output.docs.summaries && bNoStandaloneEvents
                && bNotAggEvents && bNotAggFacts) {
            fields.put(DocumentPojo.associations_, 0);
        }
        //TESTED

        //cm = new CollectionManager();
        boolean bPrimary = true;

        if (_replicaSetDistributionRatio > 0) {
            if (0 != (new Date().getTime() % _replicaSetDistributionRatio)) {
                bPrimary = false;
            }
        }
        if (bPrimary) { // Get from the primary
            docdCursor = docDb.find(query, fields).batchSize(nFromServerLimit);
        } else { // Try and get from the secondary if possible
            docdCursor = docDb.find(query, fields).batchSize(nFromServerLimit)
                    .setReadPreference(ReadPreference.secondaryPreferred());
        }

    } catch (Exception e) {
        // If an exception occurs log the error
        _logger.error("Address Exception Message: " + e.getMessage(), e);
    }
    return docdCursor;
}
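
getDocIds builds an exclusion projection (each field set to 0 is dropped from the result; everything else is still returned) and optionally routes the read to a secondary. A condensed sketch of the same pattern, assuming ids, docDb, and a readFromSecondary flag are in scope and using hypothetical field names:

BasicDBObject query = new BasicDBObject("_id", new BasicDBObject("$in", ids));
BasicDBObject fields = new BasicDBObject("fullText", 0); // exclude heavyweight fields
fields.put("metadata", 0);

DBCursor cursor = readFromSecondary
        ? docDb.find(query, fields).batchSize(1000).setReadPreference(ReadPreference.secondaryPreferred())
        : docDb.find(query, fields).batchSize(1000);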

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

/**
 * Tests to see if a duplicate might exist for this source URL.
 * If it is not a duplicate, true is returned. If it is a duplicate,
 * the modified date is then checked to see if the file has been updated;
 * true is returned if the file has been updated, false otherwise.
 *
 * @param modifiedDate
 * @param sourceUrl
 * @param source
 * @return boolean (true/false)
 */
public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                    if (null != source.getDistributionFactor()) { // This is a slightly more complex case because other...
                        //...threads for this source could be writing documents asynchronously ... so we're just going to disable everything
                        //if the most recent doc is _after_ our last harvest time (since this means we've already started harvesting the new source)
                        Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo
                                .get(DocumentPojo.created_);
                        if ((null != source.getHarvestStatus())
                                && (null != source.getHarvestStatus().getHarvested()
                                        && (null != mostRecentlyModifedFile_createdTime))) {
                            if (mostRecentlyModifedFile_createdTime
                                    .after(source.getHarvestStatus().getHarvested())) {
                                _mostRecentlyModifiedFile = null;
                                _mostRecentlyModifiedDocId = null;
                            }
                        } else { // If we don't have a date then force a "slow" dedup
                            _mostRecentlyModifiedFile = null;
                            _mostRecentlyModifiedDocId = null;
                        }
                    } //TESTED
                } //(found docs)
            } //(success mode)
        } catch (Exception e) {
        } // If anything goes wrong we'll just check all files (slower)
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    else if (null == sourceUrl) {
        return true; // (for custom checking - if we couldn't get a cached value to compare against then assume we are inspecting)
    }

    // No short cut, go the long way round:      

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.sourceUrl_, sourceUrl);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2);
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).hint(hint).limit(1);
    // (this should be very fast since sourceUrl is indexed ... order doesn't matter as all docs should have the same modified)
    //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky

    if (!dbc.hasNext()) { //if there is no record, return true
        ret = true;
        modifiedDate.setTime(0);
    } else { // (all docs should have same modified, though this is ~ time ordered anyway)

        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)); // ie if different -> true -> update docs from sourceUrl
        // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where)
        // (this is just backwards compatible for a deployment where this has happened for some % -probably 100- of the docs
        //  once an RPM >=5955 is deployed this will no longer be necessary)
    }
    return ret;
}
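
The hint() call above forces the query planner onto a specific index instead of letting it choose; the hint document must match the index specification exactly (the value 2 here presumably mirrors how that sparse sourceUrl index was created). A minimal sketch, assuming a conventional { "sourceUrl": 1 } index exists:

BasicDBObject query = new BasicDBObject("sourceUrl", sourceUrl);
BasicDBObject fields = new BasicDBObject("modified", 1);
DBCursor dbc = collection.find(query, fields)
        .hint(new BasicDBObject("sourceUrl", 1)) // force the { "sourceUrl": 1 } index
        .limit(1);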

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_,
                                new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED

                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                } //TESTED (found docs)

                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //   System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));

            } //(success mode)            
        } catch (Exception e) {
        } // If anything goes wrong we'll just check all files (slower)

    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...         
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED

    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No short cut, go the long way round:

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();

    if (!foundMatch) { //if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)               
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)         
        } else { // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}
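
One subtlety in the multiple-match branch above: DBCursor.count() reports how many documents match the query and ignores limit()/skip(), while DBCursor.size() respects them. A quick illustration:

int matching = collection.find(query).count(); // total matches, ignores any limit()
int capped = collection.find(query).limit(1).size(); // respects limit(): 0 or 1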

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

private LinkedList<String> getCandidateDuplicates(BasicDBObject query, String parentSourceKey,
        boolean bUpdate) {
    _modifiedTimeOfActualDuplicate = null;
    _duplicateId = null;
    LinkedList<String> returnVal = new LinkedList<String>();

    DBCollection collection = DbManager.getDocument().getMetadata();
    BasicDBObject fields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
    if (bUpdate) {
        fields.put(DocumentPojo.modified_, 1);
        fields.put(DocumentPojo.updateId_, 1);
    } //TESTED

    boolean bPrimary = true;
    if (_replicaSetDistributionRatio > 0) {
        // (distribute based on source key, should ensure some reasonable cache grouping...)
        if (0 != (parentSourceKey.hashCode() % _replicaSetDistributionRatio)) {
            bPrimary = false;
        }
    }
    DBCursor dbc = null;
    if (bPrimary) {
        dbc = collection.find(query, fields);
    } else {
        dbc = collection.find(query, fields).setReadPreference(ReadPreference.secondaryPreferred());
    }
    while (dbc.hasNext()) {
        DBObject dbo = dbc.next();
        String sourceKey = DocumentPojo.getSourceKey((String) dbo.get(DocumentPojo.sourceKey_));
        if (null != sourceKey) {

            // Check for exact duplicates, in which case can bypass horrible functional duplicate logic:
            boolean bFoundExactDuplicate = sourceKey.equals(parentSourceKey);
            // Update logic:
            if (bUpdate && bFoundExactDuplicate) {
                _modifiedTimeOfActualDuplicate = (Date) dbo.get(DocumentPojo.modified_);
                _duplicateId = (ObjectId) dbo.get(DocumentPojo.updateId_);
                if (null == _duplicateId) { // first time, use the _id
                    _duplicateId = (ObjectId) dbo.get(DocumentPojo._id_);
                }
            } //TESTED

            if (bFoundExactDuplicate) { // Found exact duplicate, so return just that for performance
                returnVal.clear();
            }
            returnVal.add(sourceKey);

            if (bFoundExactDuplicate) { // Found exact duplicate, we're done here
                return returnVal;
            }
        } //(if doc has source key, else is malformed, ignore)         
    } //(end loop over URL-duplicates)
    return returnVal;
}

From source file:com.ikanow.infinit.e.processing.custom.utils.CustomApiUtils.java

License:Apache License

public static void getJobResults(ResponsePojo rp, CustomMapReduceJobPojo cmr, int limit, String fields,
        String findStr, String sortStr, boolean bCsv) {

    BasicDBObject queryDbo = null;
    if (null != findStr) {
        queryDbo = (BasicDBObject) com.mongodb.util.JSON.parse(findStr);
    } else {
        queryDbo = new BasicDBObject();
    } //TOTEST

    BasicDBObject fieldsDbo = new BasicDBObject();
    if (null != fields) {
        fieldsDbo = (BasicDBObject) com.mongodb.util.JSON.parse("{" + fields + "}");
    }

    //return the results:

    // Need to handle sorting...
    BasicDBObject sort = null;
    if (null != sortStr) { //override
        sort = (BasicDBObject) com.mongodb.util.JSON.parse(sortStr);
    } else { //defaults
        String sortField = "_id";
        int sortDir = 1;
        BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse(
                InfiniteHadoopUtils.getQueryOrProcessing(cmr.query, InfiniteHadoopUtils.QuerySpec.POSTPROC));
        if (postProcObject != null) {
            sortField = postProcObject.getString("sortField", "_id");
            sortDir = postProcObject.getInt("sortDirection", 1);
        } //TESTED (post proc and no post proc)
        sort = new BasicDBObject(sortField, sortDir);
    } //TOTEST

    // Case 1: DB
    rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", true,
            "Map reduce job completed at: " + cmr.lastCompletionTime));
    if ((null == cmr.exportToHdfs) || !cmr.exportToHdfs) {
        DBCursor resultCursor = null;
        DBCollection coll = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory();
        CsvGeneratingBsonDecoder csvDecoder = null;
        SizeReportingBasicBSONDecoder sizeDecoder = null;
        CustomMapReduceResultPojo cmrr = new CustomMapReduceResultPojo();
        try {
            if (bCsv) {
                coll.setDBDecoderFactory((csvDecoder = new CsvGeneratingBsonDecoder()));
            } else {
                coll.setDBDecoderFactory((sizeDecoder = new SizeReportingBasicBSONDecoder()));
            }
            if (limit > 0) {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort).limit(limit);
            } else {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort);
            }
            LinkedList<BasicDBObject> list = null;
            if (!bCsv) {
                list = new LinkedList<BasicDBObject>();
            }
            final int MAX_SIZE_CSV = 80 * 1024 * 1024; //(80MB)
            final int MAX_SIZE_JSON = 80 * 1024 * 1024; //(80MB)
            while (resultCursor.hasNext()) {
                BasicDBObject x = (BasicDBObject) resultCursor.next();
                if (!bCsv) {
                    list.add(x);
                }
                if (null != csvDecoder) {
                    if (csvDecoder.getCsv().length() > MAX_SIZE_CSV) {
                        break;
                    }
                } else if (null != sizeDecoder) {
                    if (sizeDecoder.getSize() > MAX_SIZE_JSON) {
                        break;
                    }
                }
            }
            cmrr.results = list;
        } finally {
            coll.setDBDecoderFactory(defaultDecoder);
        }
        cmrr.lastCompletionTime = cmr.lastCompletionTime;
        if (null != csvDecoder) {
            StringBuffer header = new StringBuffer();
            for (String field : csvDecoder.getOrderedFields()) {
                if (0 != header.length()) {
                    header.append(',');
                }
                header.append('"');
                header.append(field.replace("\"", "\\\""));
                header.append("\"");
            }
            header.append('\n');
            header.append(csvDecoder.getCsv().toString());
            cmrr.results = header.toString();
        }
        rp.setData(cmrr);
    } //TESTED
    else { // Case 2: HDFS

        if ((null != cmr.outputKey) && (null != cmr.outputValue)
                && cmr.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                && cmr.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
            // special case, text file
            try {
                rp.setData(HadoopUtils.getBsonFromTextFiles(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in text file format, did you run the job before changing the output to Text/Text?"));
            }
        } //TESTED
        else { // sequence file
            try {
                rp.setData(HadoopUtils.getBsonFromSequenceFile(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in sequence file format, did you run the job with Text/Text?"));
            }
        } //TESTED
    } //TESTED      
}
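
getJobResults builds its find() arguments by parsing caller-supplied JSON strings with the legacy com.mongodb.util.JSON helper. A minimal sketch of that approach, with hypothetical JSON strings and a coll collection in scope:

BasicDBObject query = (BasicDBObject) com.mongodb.util.JSON.parse("{ \"value\": { \"$gt\": 10 } }");
BasicDBObject fields = (BasicDBObject) com.mongodb.util.JSON.parse("{ \"value\": 1 }");
BasicDBObject sort = (BasicDBObject) com.mongodb.util.JSON.parse("{ \"_id\": -1 }");

DBCursor cursor = coll.find(query, fields).sort(sort).limit(100);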

From source file:com.ikanow.infinit.e.processing.generic.synchronization.SynchronizationManager.java

License:Open Source License

/**
 * Does the DB sync: pulls all mongo docs created since the cleanseStartTime
 * for the given sources and makes sure they are in the search db.
 *
 * @param cleanseStartTime 1 hour before this harvester started
 * @param dbCache filled with the ids of the docs seen in the db
 * @return The number of errors fixed (docs deleted)
 */
// DON'T USE THIS UNTIL REWRITTEN - IT SHOULD TRANSFER DOCS ACROSS, NOT LEAVE THEM ALONE
@Deprecated
public int syncDB(long cleanseStartTime, Set<String> dbCache) {
    dbCache.clear();

    int fixcount = 0;
    DBCollection contentDb = DbManager.getDocument().getContent();
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    StoreAndIndexManager storeManager = new StoreAndIndexManager();

    for (SourcePojo sp : sources) {
        // Don't combine the sources (apart from unusual multi-community case), because
        // that prevents you from using the compound sourceKey/_id index

        List<String> sourceKeyList = new ArrayList<String>();
        sourceKeyList.addAll(sp.getDistributedKeys());

        try {
            List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
            //FIRST DO ALL NEW FEEDS
            BasicDBObject query = new BasicDBObject();
            query.put(DocumentPojo._id_,
                    new BasicDBObject(MongoDbManager.gt_, new ObjectId((int) (cleanseStartTime / 1000), 0, 0))); // time aspect
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); //source aspect
            BasicDBObject queryFields = new BasicDBObject();
            queryFields.append(DocumentPojo.url_, 1);
            queryFields.append(DocumentPojo.index_, 1);
            queryFields.append(DocumentPojo.sourceKey_, 1);

            DBCursor cur = documentDb.find(query, queryFields).batchSize(100);
            ElasticSearchManager esm = null;
            ElasticSearchManager esm_base = ElasticSearchManager.getIndex("document_index");
            String sIndex = null;

            while (cur.hasNext()) {
                if (bKillMeNow) {
                    return fixcount;
                }

                DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
                if (null != doc.getId()) {
                    dbCache.add(doc.getId().toString());
                }

                // Get index of doc to check in:
                String sNewIndex = doc.getIndex();
                if (null == sNewIndex) {
                    sIndex = null;
                    esm = esm_base;
                } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                    sIndex = sNewIndex;
                    if (sNewIndex.equals("document_index")) {
                        esm = esm_base;
                    } else {
                        esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                    }
                }

                //Compare mongo doc to search doc
                Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                if (null == results || results.isEmpty()) {
                    //either too many entries (duplicates) or no entry
                    //delete this doc from both
                    logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                            + " not found in search (or duplicate)");
                    docs_to_remove.add(doc);
                    documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                    BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                    contentQ.put(CompressedFullTextPojo.sourceKey_,
                            new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                    contentDb.remove(contentQ);
                    fixcount++;
                }
            } //end loop over new docs for this source
            storeManager.removeFromSearch(docs_to_remove);

            //NOW VERIFY ALL OLD FEEDS
            int iteration = 1;
            boolean removedAll = true;
            docs_to_remove.clear();
            while (removedAll) {
                int rows = iteration * iteration * 10; //10x^2 exponentially check more docs
                int oldfixes = 0;
                BasicDBObject queryOLD = new BasicDBObject();
                queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); //source aspect
                BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);

                DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
                while (curOLD.hasNext()) {
                    DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);
                    if (null != doc.getId()) {
                        dbCache.add(doc.getId().toString());
                    }

                    // Get index of doc to check in:
                    String sNewIndex = doc.getIndex();
                    if (null == sNewIndex) {
                        sIndex = null;
                        esm = esm_base;
                    } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                        sIndex = sNewIndex;
                        if (sNewIndex.equals("document_index")) {
                            esm = esm_base;
                        } else {
                            esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                        }
                    }

                    //Compare mongo doc to search doc
                    Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                    if (null == results || results.isEmpty()) {
                        //either too many entries (duplicates) or no entry
                        //delete this doc from both
                        logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                                + " not found in search (or duplicate)");
                        docs_to_remove.add(doc);
                        documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                        contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
                        fixcount++;
                        oldfixes++;
                    }
                }
                if (oldfixes != rows)
                    removedAll = false;
                iteration++; // grow the re-check window exponentially (10x^2)
            } //(end loop over old docs for this source)
            storeManager.removeFromSearch(docs_to_remove);
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
    }
    return fixcount;
}

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:

    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();

    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)

    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString().contains("\"" + DocumentPojo.sourceKey_ + "\"")) {
            throw new RuntimeException(
                    "Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
            query.put(DocumentPojo.sourceKey_,
                    SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                sourceKeys.addAll(sourceKeysForSource);
            }
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)         
    } else {
        if (query.toString().contains("\"" + DocumentPojo.sourceKey_ + "\"")) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key,
                            distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                    added += sourceKeysForSource.size();
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();

                System.out.println("(Can't optimize complex community query: " + e.getMessage());
            }
        } //TESTED (by hand - including distributed source version)
    }
    // Ignored delete objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
      //DEBUG
      //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {
        EntityBackgroundAggregationManager.startThread();
        AssociationBackgroundAggregationManager.startThread();
    }

    //Debug:
    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    byte[] storageArray = new byte[200000];

    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        }
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            _deletedIndex.add(sDocIndex);
            rebuildIndex(sDocIndex);
            try { // (Just in case the index requires some time to sort itself out)
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
        }

        //Debug:
        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);

            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    if (dbc.hasNext()) {
                        continue;
                    } //TESTED (by hand)               
                }

                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                doc.setFullText(output.toString());
            }
        }
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                    .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);

                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                        //DEBUG
                        e.printStackTrace();
                    }
                } //TESTED (by hand)

                _sourceCache.put(doc.getSourceKey(), src);
            }
        }
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }

                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                doc.getRawSourceKey()); // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                        docsDB.update(updateQuery,
                                new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_,
                                        new BasicDBObject(DbManager.each_, tagsTidied))));
                    }
                    doc.setTags(tagsTidied); // (just copy ptr across)
                }
            }
        }

        // 2. Update the index with the new document            

        // (Optionally also update entity and assoc features)

        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                perCommunityDocList.add(doc);
            }
        } //TESTED

        nSynced++;
        docsToTransfer.add(doc);
        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();

            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs               
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;

            } //TOTEST            

            manager.addToSearch(docsToTransfer);
            docsToTransfer.clear();
            System.out.println("(Synced " + nSynced + " records)");
        }

    } // (End loop over docs)

    // Sync remaining docs

    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs               
            doAggregation(communityList, docsToTransfer);
        }

        StoreAndIndexManager manager = new StoreAndIndexManager();
        manager.addToSearch(docsToTransfer);
    }

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }

    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
        System.out.println(
                "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
            Thread.sleep(300000);
        } catch (InterruptedException e) {
        }

        // Turn off so we can exit
        EntityBackgroundAggregationManager.stopThreadAndWait();
        AssociationBackgroundAggregationManager.stopThreadAndWait();
    }
}
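
doTransfer narrows its cursor to a shard-chunk range via addSpecial($min/$max), which bounds the index scan rather than filtering documents. A condensed sketch, assuming minBound and maxBound are index-shaped bound documents (hypothetical variables):

DBCursor dbc = docsDB.find(query)
        .addSpecial("$min", minBound) // inclusive lower index bound
        .addSpecial("$max", maxBound) // exclusive upper index bound
        .skip(nSkip).limit(nLimit).batchSize(1000);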

From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java

License:Apache License

static void doDelete(BasicDBObject query, int nLimit, boolean automatedRequest) {
    try {
        // Initialize the DB:   
        DBCollection entityFeatureDB = DbManager.getFeature().getEntity();
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("entity_index");

        BasicDBObject fields = new BasicDBObject();
        fields.put(EntityFeaturePojo.index_, 1);
        fields.put(EntityFeaturePojo.communityId_, 1);

        DBCursor cur = entityFeatureDB.find(query, fields).limit(nLimit);
        // (this internally works in batches of 1000)
        if (automatedRequest) {
            System.out.println("Found " + cur.count() + " records to delete from _id list");
        } else {
            System.out.println("Found " + cur.count() + " records to delete from " + query.toString());
        }
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }
        int nArraySize = (cur.count() > 1000) ? 1000 : cur.count();
        ArrayList<EntityFeaturePojo> batchList = new ArrayList<EntityFeaturePojo>(nArraySize);

        while (cur.hasNext()) {
            EntityFeaturePojo gp = EntityFeaturePojo.fromDb(cur.next(), EntityFeaturePojo.class);
            batchList.add(gp);
            if (batchList.size() >= nArraySize) {
                internalDelete(batchList, elasticManager);
                batchList.clear();
            }
        }
        if (!batchList.isEmpty()) {
            internalDelete(batchList, elasticManager);
        }
        entityFeatureDB.remove(query);

    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (MongoException e) {
        e.printStackTrace();
    }

}

From source file:com.impetus.client.mongodb.DefaultMongoDBDataHandler.java

License:Apache License

/**
 * Retrieves a collection of embedded objects within a document that match the
 * criteria specified in <code>query</code>. TODO: This code requires a
 * serious overhaul. Currently it assumes that the user query is in the form
 * "Select alias.columnName from EntityName alias". However, the correct query
 * to be supported is
 * "Select alias.superColumnName.columnName from EntityName alias"
 * 
 * @param dbCollection
 *            the db collection
 * @param m
 *            the m
 * @param documentName
 *            the document name
 * @param mongoQuery
 *            the mongo query
 * @param result
 *            the result
 * @param orderBy
 *            the order by
 * @param maxResult
 * @return the embedded object list
 * @throws PropertyAccessException
 *             the property access exception
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
public List getEmbeddedObjectList(DBCollection dbCollection, EntityMetadata m, String documentName,
        BasicDBObject mongoQuery, String result, BasicDBObject orderBy, int maxResult, int firstResult,
        BasicDBObject keys, final KunderaMetadata kunderaMetadata)
        throws PropertyAccessException, InstantiationException, IllegalAccessException {
    List list = new ArrayList(); // List of embedded objects to be returned

    // Specified after entity alias in query
    String columnName = result;

    // Something user didn't specify and we have to derive
    // TODO: User must specify this in query and remove this logic once
    // query format is changed

    String enclosingDocumentName = null;

    MetamodelImpl metaModel = (MetamodelImpl) kunderaMetadata.getApplicationMetadata()
            .getMetamodel(m.getPersistenceUnit());
    EntityType entityType = metaModel.entity(m.getEntityClazz());
    EmbeddableType superColumn = null;
    Set<Attribute> columns = null;
    Attribute attrib = null;
    try {
        attrib = entityType.getAttribute(columnName);
        Map<String, EmbeddableType> embeddables = metaModel.getEmbeddables(m.getEntityClazz());
        for (String key : embeddables.keySet()) {
            superColumn = embeddables.get(key);
            columns = superColumn.getAttributes();

            for (Attribute column : columns) {
                if (((AbstractAttribute) column).getJPAColumnName().equals(columnName)) {
                    enclosingDocumentName = key;
                    break;
                }
            }
        }
    } catch (IllegalArgumentException iax) {
        if (log.isWarnEnabled()) {
            log.warn("No column found for: " + columnName);
        }
    }

    // Query for fetching entities based on user-specified criteria,
    // applying the optional sort plus pagination
    DBCursor cursor = orderBy != null
            ? dbCollection.find(mongoQuery, keys).sort(orderBy).limit(maxResult).skip(firstResult)
            : dbCollection.find(mongoQuery, keys).limit(maxResult).skip(firstResult);

    if (superColumn != null) {
        Field superColumnField = (Field) attrib.getJavaMember();
        while (cursor.hasNext()) {
            DBObject fetchedDocument = cursor.next();
            Object embeddedDocumentObject = fetchedDocument.get(superColumnField.getName());

            if (embeddedDocumentObject != null) {
                if (embeddedDocumentObject instanceof BasicDBList) {
                    Class embeddedObjectClass = PropertyAccessorHelper.getGenericClass(superColumnField);
                    for (Object dbObj : (BasicDBList) embeddedDocumentObject) {
                        Object obj = embeddedObjectClass.newInstance();
                        Object embeddedObject = new DocumentObjectMapper().getObjectFromDocument(metaModel,
                                (BasicDBObject) dbObj, superColumn.getAttributes(), obj);
                        list.add(embeddedObject); // collect each embedded element of the list
                    }
                } else if (embeddedDocumentObject instanceof BasicDBObject) {
                    Object obj = superColumn.getJavaType().newInstance();
                    Object embeddedObject = DocumentObjectMapper.getObjectFromDocument(metaModel,
                            (BasicDBObject) embeddedDocumentObject, superColumn.getAttributes(), obj);
                    list.add(embeddedObject);
                } else {
                    throw new PersistenceException("Can't retrieve embedded object from MongoDB document because "
                            + "it wasn't stored as a BasicDBObject; possible problem in format.");
                }
            }
        }
    }
    return list;
}
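
When both ordering and pagination are needed, the cursor modifiers chain: sort() should be combined with skip()/limit() rather than used instead of them, as in this sketch:

DBCursor page = dbCollection.find(mongoQuery, keys)
        .sort(new BasicDBObject("_id", 1))
        .skip(firstResult)
        .limit(maxResult);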