Example usage for com.mongodb DBCollection find

List of usage examples for com.mongodb DBCollection find


In this page you can find the example usage for com.mongodb DBCollection find.


public DBCursor find(final DBObject query) 

Source Link


Select documents in collection and get a cursor to the selected documents.


From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License:Open Source License

/**
 * Looks up entity-feature records whose {@code field} value is one of {@code terms} and
 * collects, per term, a set of alias strings.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction (several blocks are
 * never closed and some statements stop mid-expression), so the comments below describe only
 * what is visible.
 *
 * @param entityFeatureDb collection to query; when null, the collection returned by
 *                        {@code DbManager.getFeature().getEntity()} is used instead
 * @param field name of the entity-feature field the terms are matched against
 * @param terms values to look up; each one becomes a key of the returned map
 * @param userIdStr forwarded to {@code SocialUtils.getCommunityIds}
 * @param communityIdStrList forwarded to {@code SocialUtils.getCommunityIds}
 * @return the map created at the top of the method (term to alias set); judging by the
 *         indentation it is also returned after an exception has been logged - confirm
 */
public static Map<String, Set<String>> findAliases(DBCollection entityFeatureDb, String field,
        Collection<String> terms, String userIdStr, String communityIdStrList) {
    Map<String, Set<String>> aliases = new HashMap<String, Set<String>>();
    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    try {
        // Fall back to the default entity-feature collection when the caller passes none
        if (null == entityFeatureDb) {
            entityFeatureDb = DbManager.getFeature().getEntity();

        // Get all the aliases in one go, will sort them out later
        BasicDBObject query = new BasicDBObject();
        query.put(field, new BasicDBObject(MongoDbManager.in_, terms));
        // Convert the community id strings to ObjectIds for the community filter.
        // NOTE(review): no increment of i is visible, so as shown every id is written to slot 0;
        // presumably the increment was lost with the missing lines - confirm against the original.
        ObjectId[] communityIds = new ObjectId[communityIdStrs.length];
        int i = 0;
        for (String idStr : communityIdStrs) {
            communityIds[i] = new ObjectId(idStr);
        query.put(EntityFeaturePojo.communityId_, new BasicDBObject(MongoDbManager.in_, communityIds));

        // (the remaining arguments of this call are not visible in this listing)
        List<EntityFeaturePojo> gpl = EntityFeaturePojo.listFromDb(entityFeatureDb.find(query),

        // Every term gets an (initially empty) alias set, then each fetched record is tested against it
        for (String s : terms) {
            aliases.put(s, new HashSet<String>());
            for (EntityFeaturePojo gpit : gpl) {
                // The match rule depends on which field was queried: equality on index or
                // disambiguated name, containment for alias. The action taken on a match is not
                // visible in this listing.
                if ((field.equals(EntityFeaturePojo.index_) && gpit.getIndex().equals(s)) // gazname
                        || (field.equals(EntityFeaturePojo.disambiguated_name_)
                                && gpit.getDisambiguatedName().equals(s)) // alias
                        || (field.equals(EntityFeaturePojo.alias_) && gpit.getAlias().contains(s))) // alias
    } catch (Exception e) {
        // Any failure is logged with its stack trace rather than propagated
        logger.error("Exception Message: " + e.getMessage(), e);
    return aliases;

From source file:com.ikanow.infinit.e.api.utils.RESTTools.java

License:Open Source License

 * Creates a new session for a user, adding
 * an entry to our cookie table (maps cookieid
 * to userid) and starts the clock
 * @param userid id of the user the session is created for
 * @param bMulti if true lets you login from many sources
 * @param bOverride if false will fail if already logged in
 * @return the id of the new cookie, or null if the session could not be created
// Summary: creates a CookiePojo for userid and returns its cookie id. Unless bMulti is set, the
// existing cookies of that user that carry no apiKey are looked up first: with bOverride they are
// iterated (the loop body is not visible here), without it their presence makes the call return null.
// Any exception is logged and turned into a null return.
// NOTE(review): closing braces and several statements appear to be missing from this listing.
public static ObjectId createSession(ObjectId userid, boolean bMulti, boolean bOverride) {

    try {
        DBCollection cookieColl = DbManager.getSocial().getCookies();

        if (!bMulti) { // Otherwise allow multiple cookies for this user
            //remove any old cookie for this user
            BasicDBObject dbQuery = new BasicDBObject();
            dbQuery.put("profileId", userid);
            // (only cookies without an apiKey field - presumably exists_ is the Mongo exists operator; confirm)
            dbQuery.put("apiKey", new BasicDBObject(DbManager.exists_, false));
            DBCursor dbc = cookieColl.find(dbQuery);
            if (bOverride) {
                while (dbc.hasNext()) {
            } //TESTED
            // NOTE(review): per the legacy driver docs DBCursor.length() pulls the whole result set
            // into memory; here it is only used as an existence test
            else if (dbc.length() > 0) {
                return null;
            } //TESTED
        //Find user
        //create a new entry
        CookiePojo cp = new CookiePojo();
        // NOTE(review): randomObjectId is not used in the visible lines; presumably it is assigned
        // to the cookie in lines missing from this listing
        ObjectId randomObjectId = generateRandomId();

        cp.setLastActivity(new Date());
        cp.setStartDate(new Date());
        //return cookieid
        return cp.getCookieId();
    } catch (Exception e) {
        // NOTE(review): indexing getStackTrace()[2] assumes at least three frames, and only the
        // message is logged (no stack trace)
        logger.error("Line: [" + e.getStackTrace()[2].getLineNumber() + "] " + e.getMessage());

    return null;

From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java

License:Open Source License

 * Moves the output of a job from output_tmp to output and deletes
 * the tmp collection.
 * @param cmr
 * @throws IOException 
 * @throws ParserConfigurationException 
 * @throws SAXException 
// Summary: publishes the temporary output of a custom map/reduce job.
//  step 1  - parse the optional post-processing settings (limitAllData, limit, sortField, sortDirection)
//  step 2a - not appending: reshape the temp collection, then swap the output/temp collection names
//            both in the lookup record and on cmr
//  step 2b - appending: flag isUpdatingOutput, age out old results, copy the temp results across,
//            optionally re-sort, then clear the flag
//  step 3  - drop the temp collection
// NOTE(review): many closing braces and some statements are missing from this listing, so block
// boundaries below are inferred from the indentation only.
private void moveTempOutput(CustomMapReduceJobPojo cmr)
        throws IOException, SAXException, ParserConfigurationException {
    // If we are an export job then move files:
    // (the rest of this will just do nothing) 

     * Atomic plan:
     * If not append, move customlookup pointer to tmp collection, drop old collection.
     * If append, set sync flag (find/mod), move results from tmp to old, unset sync flag.
    //step1 build out any of the post proc arguments
    DBObject postProcObject = null;
    // Defaults used when no post-processing object is supplied: no sort, limit 0
    boolean limitAllData = true;
    boolean hasSort = false;
    int limit = 0;
    BasicDBObject sort = new BasicDBObject();
    try {
        postProcObject = (DBObject) com.mongodb.util.JSON
                .parse(getQueryOrProcessing(cmr.query, QuerySpec.POSTPROC));
        if (postProcObject != null) {
            if (postProcObject.containsField("limitAllData")) {
                limitAllData = (Boolean) postProcObject.get("limitAllData");
            if (postProcObject.containsField("limit")) {
                limit = (Integer) postProcObject.get("limit");
                if (postProcObject.containsField("sortField")) {
                    String sfield = (String) postProcObject.get("sortField");
                    // sort direction defaults to 1 unless sortDirection is given
                    int sortDir = 1;
                    if (postProcObject.containsField("sortDirection")) {
                        sortDir = (Integer) postProcObject.get("sortDirection");
                    sort.put(sfield, sortDir);
                    hasSort = true;
                } else if (limit > 0) {
                    //set a default sort because the user posted a limit
                    sort.put("_id", -1);
                    hasSort = true;
    } catch (Exception ex) {
                // (the call that consumes this message is not visible in this listing)
                "job_error_post_proc_title=" + cmr.jobtitle + " job_error_post_proc_id=" + cmr._id.toString()
                        + " job_error_post_proc_message=" + HarvestExceptionUtils.createExceptionMessage(ex));

    //step 2a if not appending results then work on temp collection and swap to main
    if ((null == cmr.appendResults) || !cmr.appendResults) //format temp then change lookup pointer to temp collection
        //transform all the results into necessary format:         
        // Only documents whose key is null are selected, i.e. ones not yet transformed
        DBCursor dbc_tmp = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                .find(new BasicDBObject("key", null)).sort(sort).limit(limit);
        while (dbc_tmp.hasNext()) {
            DBObject dbo = dbc_tmp.next();
            // Copy the original _id into key and write the document back to the temp collection
            // NOTE(review): as shown the document still carries its original _id when inserted;
            // presumably a statement was lost here - confirm
            Object key = dbo.get("_id");
            dbo.put("key", key);
            DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp).insert(dbo);
        // Remove whatever still has a null key (the untransformed originals)
        DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                .remove(new BasicDBObject("key", null));

        //swap the output collections
        BasicDBObject notappendupdates = new BasicDBObject(CustomMapReduceJobPojo.outputCollection_,
        notappendupdates.append(CustomMapReduceJobPojo.outputCollectionTemp_, cmr.outputCollection);
        DbManager.getCustom().getLookup().findAndModify(new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                new BasicDBObject(MongoDbManager.set_, notappendupdates));
        // Mirror the swap on the in-memory job object
        String temp = cmr.outputCollectionTemp;
        cmr.outputCollectionTemp = cmr.outputCollection;
        cmr.outputCollection = temp;
    } else //step 2b if appending results then drop modified results in output collection
        // Raise the isUpdatingOutput flag on the lookup record while the output collection is modified
        DbManager.getCustom().getLookup().findAndModify(new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                new BasicDBObject(MongoDbManager.set_, new BasicDBObject("isUpdatingOutput", true)));
        //remove any aged out results
        if ((null != cmr.appendAgeOutInDays) && cmr.appendAgeOutInDays > 0) {
            //remove any results that have aged out
            long ageOutMS = (long) (cmr.appendAgeOutInDays * MS_IN_DAY);
            Date lastAgeOut = new Date(((new Date()).getTime() - ageOutMS));
            // Age is judged by _id: everything below an ObjectId built from the cut-off date is removed
            DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection).remove(
                    new BasicDBObject("_id", new BasicDBObject(MongoDbManager.lt_, new ObjectId(lastAgeOut))));
        DBCursor dbc_tmp;
        if (!limitAllData) {
            //sort and limit the temp data set because we only want to process it
            dbc_tmp = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                    .find(new BasicDBObject("key", null)).sort(sort).limit(limit);
            limit = 0; //reset limit so we get everything in a few steps (we only want to limit the new data)
        } else {
            dbc_tmp = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                    .find(new BasicDBObject("key", null));

        DBCollection dbc = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        //transform temp results and dump into output collection
        while (dbc_tmp.hasNext()) {
            DBObject dbo = dbc_tmp.next();
            //transform the dbo to format {_id:ObjectId, key:(prev_id), value:value}
            Object key = dbo.get("_id");
            dbo.put("key", key);
            //_id field should be automatically set to objectid when inserting now
            // (the insert into the output collection is not visible in this listing)
        //if there is a sort, we need to apply it to all the data now
        if (hasSort) {
            // A freshly generated ObjectId serves as the upper bound for everything inserted before now
            ObjectId OID = new ObjectId();
            BasicDBObject query = new BasicDBObject("_id", new BasicDBObject(MongoDbManager.lt_, OID));
            //find everything inserted before now and sort/limit the data
            DBCursor dbc_sort = dbc.find(query).sort(sort).limit(limit);
            while (dbc_sort.hasNext()) {
                //reinsert the data into db (it should be in sorted order naturally now)
                DBObject dbo = dbc_sort.next();
            //remove everything inserted before we reorganized everything (should leave only the new results in natural order)
        // Clear the isUpdatingOutput flag again
        DbManager.getCustom().getLookup().findAndModify(new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                new BasicDBObject(MongoDbManager.set_, new BasicDBObject("isUpdatingOutput", false)));
    //step3 clean up temp output collection so we can use it again
    // (drop it, removing chunks)
    try {
        DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp).drop();
    } catch (Exception e) {
    } // That's fine, it probably just doesn't exist yet...

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java

License:Open Source License

 * getGeoReference
 * @param geoDb
 * @param query
 * @param nMaxReturns
 * @return
private static DBCursor getGeoReference(DBCollection geoDb, BasicDBObject query, int nMaxReturns) {
    if (nMaxReturns == -1) {
        return geoDb.find(query);
    } else {
        return geoDb.find(query).limit(nMaxReturns);

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java

License:Open Source License

 * getNearestCities
 * Get n-cities near a lat/lon pair, results returned ordered by distance from
 * the lat/lon pair
 * @param geoDb
 * @param lat
 * @param lon
 * @param nMaxReturns
 * @return List<GeoFeaturePojo>
// Summary: parses lat and lon as doubles, runs a near query on the geoindex field of geoDb limited to
// nMaxReturns documents, and converts the cursor to a list of GeoFeaturePojo.
// Returns null when anything throws, including a lat or lon string that does not parse as a double.
// NOTE(review): closing braces and the tail of the TypeToken expression are missing from this listing.
public static List<GeoFeaturePojo> getNearestCities(DBCollection geoDb, String lat, String lon,
        int nMaxReturns) {
    try {
        // Create Double[] from lat, lon
        // NOTE(review): the pair is built as [lat, lon]; MongoDB legacy coordinate pairs are
        // conventionally [lon, lat] - confirm this matches how geoindex is stored
        Double[] d = new Double[] { Double.parseDouble(lat), Double.parseDouble(lon) };

        // Build query object to return the shell equivalent of:
        // db.georeference.find({geoindex : {$near : [lat, lon]}})
        BasicDBObject query = new BasicDBObject();
        BasicDBObject near = new BasicDBObject();
        near.append("$near", d);
        query.put("geoindex", near);

        // Perform query
        DBCursor result = geoDb.find(query).limit(nMaxReturns);

        // Convert results to List<GeoFeaturePojo>
        List<GeoFeaturePojo> gpl = GeoFeaturePojo.listFromDb(result,
                new TypeToken<ArrayList<GeoFeaturePojo>>() {
        return gpl;
    } catch (Exception e) {
        // Failures are not logged; the caller only sees null
        return null;

From source file:com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java

License:Open Source License

/**
 * Lists the children of this file object.
 * <ul>
 * <li>Not a directory: returns a one-element array containing this object.</li>
 * <li>Directory backed by a share: returns one entry per element of the zip view.</li>
 * <li>Directory backed by a custom job: returns either one virtual directory per chunk of the
 *     output collection (when it has two or more chunks) or file objects for the documents of
 *     the collection or chunk range, sized for at most {@code 1 + maxDocsPerCycle} entries.</li>
 * <li>Otherwise: returns null.</li>
 * </ul>
 * NOTE(review): closing braces and several statements (for example the increments of
 * {@code added} and the bodies of some filters) are missing from this listing.
 *
 * @param optionalFilterDate when non-null, used to skip chunks or documents whose ObjectId time
 *                           is older than this date
 * @param maxDocsPerCycle number of documents returned as files before the remainder is
 *                        represented by a further virtual directory
 * @return the child entries as described above, or null
 */
public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) {
    if (_isDirectory) {
        if (_isShare) { // must be a zip file
            ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>();
            Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries();
            while (entries.hasMoreElements()) {
                net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName());
                zipFiles.add(newFile);
            return zipFiles.toArray(new InfiniteFile[zipFiles.size()]);
        } //TESTED (3.2)
        else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory"
            String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_);
            String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_);
            // The database name falls back to custommr when the job record has none
            if (null == outputDatabase) {
                outputDatabase = "custommr";
            DBCollection outColl = null;
            DBCursor dbc = null;
            if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory

                // Count the chunk documents registered in config.chunks for this namespace
                DBCollection chunks = MongoDbManager.getCollection("config", "chunks");
                StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection);
                dbc = chunks.find(new BasicDBObject("ns", ns.toString()));
                int splits = dbc.count();

                if (splits < 2) { // Nothing to do (unsharded or 1 chunk)

                    outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                    dbc = outColl.find();
                } //TESTED (4.2)
                else { // Create one virtual dir per split
                    InfiniteFile[] virtualDirs = new InfiniteFile[splits];
                    int added = 0;
                    for (DBObject splitObj : dbc) {
                        BasicDBObject minObj = (BasicDBObject) splitObj.get("min");
                        BasicDBObject maxObj = (BasicDBObject) splitObj.get("max");
                        // A chunk bound whose _id is not an ObjectId leaves the id null
                        // (the failed cast is deliberately swallowed)
                        ObjectId minId = null;
                        try {
                            minId = (ObjectId) minObj.get("_id");
                        } catch (Exception e) {
                        } // min key..
                        ObjectId maxId = null;
                        try {
                            maxId = (ObjectId) maxObj.get("_id");
                        } catch (Exception e) {
                        } // max key..

                        //Handle current case where custom jobs are all dumped in with the wrong _id type                     
                        if ((null != minId) || (null != maxId)) {
                            if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below)

                                // (the action taken for a chunk older than the filter date is not visible here)
                                if (maxId.getTime() < optionalFilterDate.getTime()) {
                                    // (the "getTime()"s can overlap across chunks so we have to use minId
                                    //  and accept that we'll often deserialize 1+ extra chunk every harvest)
                            } //TESTED (by hand)

                            InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId);
                            virtualDirs[added] = split;
                        } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand)
                    return virtualDirs;
                } //TESTED (5.2.2, 6.2.2)
            } //TESTED
            else { // Virtual directory
                // Build the range condition on _id; it is wrapped under the _id key when the query runs below
                BasicDBObject query = new BasicDBObject();
                if (null != _virtualDirStartLimit) {
                    if (null != optionalFilterDate) {
                        ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                        //(zero out the inc/machine ids so this query is independent to calling service)

                        // Use whichever lower bound is later: the filter date or the start of this directory
                        if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit)
                            query.put(MongoDbManager.gte_, altStartId);
                        } else {
                            query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                    } //TESTED (by hand)
                    else { // normal case
                        query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                } else if (null != optionalFilterDate) { // (first chunk so always overwrite with optionalFilter date if applicable)
                    ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                    query.put(MongoDbManager.gte_, altStartId);
                } //TESTED (by hand)
                if (null != _virtualDirEndLimit) {
                    query.put(MongoDbManager.lt_, _virtualDirEndLimit);

                outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                // One document more than maxDocsPerCycle is requested so an overflow can be detected
                dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle);
            } //TESTED (6.2.2) (doc skipping by hand)

            if (null != outColl) { // has files, create the actual file objects
                //System.out.println("CHUNK: GOT " + dbc.count());

                // The count is clamped to the cursor limit before it sizes the result array
                int docCount = dbc.count();
                if (docCount > 1 + maxDocsPerCycle) {
                    docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway)
                InfiniteFile[] docs = new InfiniteFile[docCount];
                int added = 0;
                for (DBObject docObj : dbc) {
                    // (if didn't use a query then apply internal filter date by hand)
                    if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)
                            && (null != optionalFilterDate)) {
                        ObjectId docId = (ObjectId) docObj.get("_id");
                        // (the action taken for a document older than the filter date is not visible here)
                        if (optionalFilterDate.getTime() > docId.getTime()) {
                    } //TESTED

                    if (added >= maxDocsPerCycle) { // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to)
                        docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"),
                    } else {
                        InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj);
                        docs[added] = doc;
                    } //TESTED (both cases)
                return docs;

            } //TESTED (4.2)
    } else { // can just return myself
        InfiniteFile[] retVal = new InfiniteFile[1];
        retVal[0] = this;
        return retVal;
    } //TESTED (1.2, 2.2)
    return null;

From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java

License:Apache License

/**
 * Copies association-feature records from MongoDB into the Elasticsearch index
 * {@code association_index}.
 * <p>
 * The index is created with 1 shard, 0 replicas and a {@code suggestAnalyzer}; if creation
 * returns null the existing index is fetched instead. Records matching {@code query} are then
 * read (skip, limit, batch size 1000), converted to {@code AssociationFeaturePojo} and
 * bulk-added, with a final bulk add for whatever remains after the loop.
 * <p>
 * NOTE(review): closing braces and several statements (print calls, the list add, the
 * {@code nSynced} increment, parts of the update and bulk-add calls) are missing from this
 * listing.
 *
 * @param query MongoDB query selecting the records to transfer
 * @param nSkip number of matching records to skip
 * @param nLimit maximum number of records to read; 0 is reported as process everything
 * @param chunk optional object whose min and max entries, when present, are applied to the
 *              cursor via {@code addSpecial}; may be null
 */
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 


    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex("association_index");

    // Create the index if necessary
    String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping,
            localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex("association_index");

    // Now query the DB:

    DBCursor dbc = null;
    dbc = eventFeatureDB.find(query);
    // When a chunk is supplied, its min and max bounds are attached to the cursor as special options
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    // Without a chunk, the number of records left after the skip is reported up front
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...

    List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();

    int nSynced = 0;

    // Loop over array and invoke the cleansing function for each one
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

        // If this table has just been rebuilt from the document then the indexes are all wrong ...
        // recalculate and save
        // (an index starting with # marks such a record; the recalculation itself is not visible here)
        if ('#' == evt.getIndex().charAt(0)) {
            AssociationPojo singleEvt = new AssociationPojo();
                    .update(new BasicDBObject("_id", dbo.get("_id")),
                            new BasicDBObject(MongoDbManager.set_,
                                    new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                            false, true);
            // (has to be a multi-update even though it's unique because it's sharded on index)

        // Handle groups (system group is: "4c927585d591d31d7b37097a")
        if (null == evt.getCommunityId()) {
            evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
        // Bulk add prep

        // Flush to Elasticsearch once more than 1000 records have accumulated
        if (events.size() > 1000) {
                    AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null,
    // End loop over entities

    //write whatevers left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
            new AssociationFeaturePojoIndexMap()), "_id", null, true);

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");

From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java

License:Apache License

/**
 * Deletes the association-feature records matching {@code query} from MongoDB, reading at most
 * {@code nLimit} of them, and obtains the {@code association_index} Elasticsearch manager.
 * <p>
 * NOTE(review): closing braces, the rest of the {@code fromDb} call, the statement that collects
 * the event ids and any Elasticsearch delete call are missing from this listing.
 *
 * @param query MongoDB query selecting the records to delete
 * @param nLimit limit applied to the cursor; values above 0 are echoed to stdout
 */
private void doDelete(BasicDBObject query, int nLimit) {
    try {
        // Initialize the DB:   
        DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

        DBCursor cur = eventFeatureDB.find(query).limit(nLimit);
        // (this internally works in batches of 1000; just get _id)
        // NOTE(review): the printed count is followed by a separate limited-to message, which
        // suggests cur.count() does not take the limit into account - confirm against the driver docs
        System.out.println("Found " + cur.count() + " records to delete");
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");

        ArrayList<AssociationFeaturePojo> events = new ArrayList<AssociationFeaturePojo>();
        LinkedList<String> eventIds = new LinkedList<String>();
        while (cur.hasNext()) {
            AssociationFeaturePojo event = AssociationFeaturePojo.fromDb(cur.next(),
                    // (fragment: builds a string of the form index:communityId)
                    new StringBuffer(event.getIndex()).append(":").append(event.getCommunityId()).toString());
            // NOTE(review): the remove filter uses the index field only, not the community id, so
            // every record sharing that index value is removed - confirm this is intended
            eventFeatureDB.remove(new BasicDBObject("index", event.getIndex()));
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("association_index");

    // NOTE(review): no handler bodies are visible for the two catch clauses below
    } catch (NumberFormatException e) {
    } catch (MongoException e) {

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

/**
 * Debug/test routine: recreates the {@code document_index} Elasticsearch index on the given
 * host and port, then walks up to {@code nLimit} documents from the metadata collection,
 * fetching each document's gzip-compressed content and its source record (cached in
 * {@code _sourceCache}).
 * <p>
 * NOTE(review): closing braces and a number of statements are missing from this listing; what
 * is done with the decompressed text and the tidied tags is not visible.
 *
 * @param sMongoDbHost not used in the visible lines
 * @param sMongoDbPort not used in the visible lines
 * @param sElasticHost Elasticsearch host, joined with the port as host:port
 * @param sElasticPort Elasticsearch port
 * @param query MongoDB query selecting the documents to read
 * @param nLimit limit applied to the document cursor
 */
private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
        BasicDBObject query, int nLimit) {
    ElasticSearchManager elasticManager = null;

    try {
        // Initialize the DB:

        DBCollection feedsDB = DbManager.getDocument().getMetadata();
        DBCollection contentDB = DbManager.getDocument().getContent();
        DBCollection sourcesDB = DbManager.getIngest().getSource();

        String indexName = "document_index";

        // Test/debug recreate the index
        // (the condition is hard-coded, so this branch always runs)
        if (true) {

            // (delete the index)
            System.out.println("Deleting index...");
            elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
            //(also deletes the child index - same index, different type)

            // Create the index if necessary
            String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(),

            Builder localSettings = ImmutableSettings.settingsBuilder();
            localSettings.put("number_of_shards", 10).put("number_of_replicas", 2);

            System.out.println("Creating index..." + sMapping);
            elasticManager = ElasticSearchManager.createIndex(indexName, null, false,
                    sElasticHost + ":" + sElasticPort, sMapping, localSettings);

        // Get the index (necessary if already created)
        if (null == elasticManager) {
            elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);

        // Get the feeds from the DB:

        //         System.out.println("Querying DB...");

        DBCursor dbc = feedsDB.find(query).limit(nLimit);

        // Scratch buffer reused for every document when decompressing content
        byte[] storageArray = new byte[200000];

        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);

            System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());

            // Get the content:
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                    // (fragment of a further condition: value is either null or this document source key)
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
            if (null != dboContent) {
                byte[] compressedData = ((byte[]) dboContent.get("gzip_content"));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                // NOTE(review): a single read call may return fewer bytes than the stream holds and
                // returns -1 at end of stream, which would make the String constructor below throw;
                // content beyond 200000 bytes is never read. The stream is also not closed in the
                // visible lines.
                int nRead = gzip.read(storageArray, 0, 200000);
                String s = new String(storageArray, 0, nRead, "UTF-8");
            // Get tag:
            // Source records are looked up by key once and then served from _sourceCache
            SourcePojo src = _sourceCache.get(doc.getSourceKey());
            if (null == src) {
                BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                        .findOne(new BasicDBObject("key", doc.getSourceKey()));
                if (null != srcDbo) {
                    src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);

                    _sourceCache.put(doc.getSourceKey(), src);
            if (null != src) {
                // Tags are trimmed and lower-cased; TreeSet keeps them sorted and unique
                // (the statement adding them to the set is not visible here)
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();

            //TEST: set dynamic field
            // Lots of testing of dynamic dates:
            //            feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
            //            String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated());            
            //            feed.addToMetadata("another_dateISO", s1);
            //            String s1_5 = new SimpleDateFormat().format(feed.getCreated());
            //            feed.addToMetadata("another_dateTimeJava", s1_5);
            //            String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated());            
            //            feed.addToMetadata("another_dateYYYYMMDD", s2);
            //            String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated());
            //            feed.addToMetadata("another_dateRFC822", s3);
            //            feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString());
            //            // Testing of the string field types
            //            feed.addToMetadata("my_comment", "Testing this ABCDEFG");            
            //            feed.addToMetadata("my_term", "Testing this UVWXYZ");
            //            feed.addToMetadata("my_text", "Testing this 123456");            
            //            // Test an array of longs:
            //            Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L;
            //            feed.addToMetadata("md_long", tl);

            //TEST: some dummy event timestamp adding code (not seeing much/any in the data)
            //            if (null != feed.getEvents()) {
            //               int i = 0;
            //               for (EventPojo evt: feed.getEvents()) {
            //                  //1: Add single date
            //                  if (0 == i) {
            //                     evt.time_start = "2011-01-01";
            //                  }
            //                  //2: Add short span
            //                  if (1 == i) {
            //                     evt.time_start = "2010-04-06";
            //                     evt.time_end = "2010-08-09";
            //                  }
            //                  //3: Add cross-yr span
            //                  if (2 == i) {
            //                     evt.time_start = "2012-06-05";
            //                     evt.time_end = "2013-09-05";
            //                  }
            //                  //4: Add too long span
            //                  if (3 == i) {
            //                     evt.time_start = "2012-04-06";
            //                     evt.time_end = "2014-04-09";
            //                  }
            //                  i++;
            //               }
            //            }

            // For event adding, see data_model.test.TestCode
    // NOTE(review): no handler body is visible for the IOException catch below
    } catch (IOException e) {
    } finally {
        //nothing to do

From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java

License:Apache License

/**
 * Copies entity-feature records from MongoDB into the Elasticsearch index {@code entity_index}.
 * <p>
 * The index is created with 1 shard, 0 replicas and a {@code suggestAnalyzer}; if creation
 * returns null the existing index is fetched instead. Records matching {@code query} are then
 * read (skip, limit, batch size 1000), converted to {@code EntityFeaturePojo} and bulk-added
 * whenever more than 1000 have accumulated, with a final bulk add after the loop.
 * <p>
 * NOTE(review): closing braces and several statements (print calls, the list add, the
 * {@code nSynced} increment, the tail of the Gson call) are missing from this listing.
 *
 * @param query MongoDB query selecting the records to transfer
 * @param nSkip number of matching records to skip
 * @param nLimit maximum number of records to read; 0 is reported as process everything
 * @param chunk optional object whose min and max entries, when present, are applied to the
 *              cursor via {@code addSpecial}; may be null
 */
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 

    String indexName = "entity_index";

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex(indexName);

    // Create the index if necessary
    String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex(indexName);
    }

    // Now query the DB:

    DBCursor dbc = null;
    dbc = entityFeatureDB.find(query);
    // When a chunk is supplied, its min and max bounds are attached to the cursor as special options
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    // Without a chunk, the number of records left after the skip is reported up front
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...

    int nSynced = 0;

    List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
    while (dbc.hasNext()) {
        EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

        // Records without an alias are treated as corrupt and not processed in this branch
        if (null != feature.getAlias()) { // (some corrupt gazateer entry)

            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            // if there is no community id, add system group (something is wrong if this happens?)
            if (null == feature.getCommunityId()) {
                feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));


        // Add the entities
        // Flush to Elasticsearch once more than 1000 records have accumulated, then start a new list
        if (entities.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                    new EntityFeaturePojoIndexMap()), "_id", null, true);
            // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

            entities = new ArrayList<EntityFeaturePojo>();
    //write whatevers left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
            new EntityFeaturePojoIndexMap()), "_id", null, true);
    // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");