List of usage examples for com.mongodb DBCollection find
public DBCursor find(final DBObject query)
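Before the harvested examples below, here is a minimal, self-contained sketch of the method itself, assuming the legacy 2.x Java driver; the host, database, collection, and field names are placeholders:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class FindExample {
    public static void main(String[] args) throws Exception {
        // Placeholder host/database/collection names - substitute your own
        MongoClient mongo = new MongoClient("localhost", 27017);
        DBCollection coll = mongo.getDB("mydb").getCollection("mycoll");

        // find() takes a DBObject query; BasicDBObject is the usual implementation
        DBCursor cursor = coll.find(new BasicDBObject("status", "active"));
        try {
            while (cursor.hasNext()) {
                System.out.println(cursor.next());
            }
        } finally {
            cursor.close(); // always release the server-side cursor
        }
        mongo.close();
    }
}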
From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java
License:Open Source License
public static Map<String, Set<String>> findAliases(DBCollection entityFeatureDb, String field,
        Collection<String> terms, String userIdStr, String communityIdStrList) {
    Map<String, Set<String>> aliases = new HashMap<String, Set<String>>();
    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    try {
        if (null == entityFeatureDb) {
            entityFeatureDb = DbManager.getFeature().getEntity();
        }
        // Get all the aliases in one go, will sort them out later
        BasicDBObject query = new BasicDBObject();
        query.put(field, new BasicDBObject(MongoDbManager.in_, terms));
        ObjectId[] communityIds = new ObjectId[communityIdStrs.length];
        int i = 0;
        for (String idStr : communityIdStrs) {
            communityIds[i] = new ObjectId(idStr);
            i++;
        }
        query.put(EntityFeaturePojo.communityId_, new BasicDBObject(MongoDbManager.in_, communityIds));
        List<EntityFeaturePojo> gpl = EntityFeaturePojo.listFromDb(entityFeatureDb.find(query),
                EntityFeaturePojo.listType());
        for (String s : terms) {
            aliases.put(s, new HashSet<String>());
            for (EntityFeaturePojo gpit : gpl) {
                if ((field.equals(EntityFeaturePojo.index_) && gpit.getIndex().equals(s)) // gazname
                        || (field.equals(EntityFeaturePojo.disambiguated_name_)
                                && gpit.getDisambiguatedName().equals(s)) // alias
                        || (field.equals(EntityFeaturePojo.alias_) && gpit.getAlias().contains(s))) // alias
                {
                    aliases.get(s).addAll(gpit.getAlias());
                }
            }
        }
    } catch (Exception e) {
        logger.error("Exception Message: " + e.getMessage(), e);
    }
    return aliases;
}
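The reusable pattern in this example is the $in query: a single find() call matching any of several values for one field. A reduced sketch of the same idea (a fragment; entityColl, the "index" field, and the term values are illustrative, imports as in the example above):

List<String> terms = Arrays.asList("barack obama/person", "london/geo");
BasicDBObject query = new BasicDBObject("index", new BasicDBObject("$in", terms));
DBCursor dbc = entityColl.find(query); // returns documents whose "index" equals any listed term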
From source file:com.ikanow.infinit.e.api.utils.RESTTools.java
License:Open Source License
/**
 * Creates a new session for a user, adding
 * an entry to our cookie table (maps cookieid
 * to userid) and starts the clock
 *
 * @param userid
 * @param bMulti if true lets you login from many sources
 * @param bOverride if false will fail if already logged in
 * @return
 */
public static ObjectId createSession(ObjectId userid, boolean bMulti, boolean bOverride) {
    try {
        DBCollection cookieColl = DbManager.getSocial().getCookies();
        if (!bMulti) { // Otherwise allow multiple cookies for this user
            // remove any old cookie for this user
            BasicDBObject dbQuery = new BasicDBObject();
            dbQuery.put("profileId", userid);
            dbQuery.put("apiKey", new BasicDBObject(DbManager.exists_, false));
            DBCursor dbc = cookieColl.find(dbQuery);
            if (bOverride) {
                while (dbc.hasNext()) {
                    cookieColl.remove(dbc.next());
                }
            } //TESTED
            else if (dbc.length() > 0) {
                return null;
            } //TESTED
        }
        // Find user
        // create a new entry
        CookiePojo cp = new CookiePojo();
        ObjectId randomObjectId = generateRandomId();
        cp.set_id(randomObjectId);
        cp.setCookieId(randomObjectId);
        cp.setLastActivity(new Date());
        cp.setProfileId(userid);
        cp.setStartDate(new Date());
        cookieColl.insert(cp.toDb());
        // return cookieid
        return cp.getCookieId();
    } catch (Exception e) {
        logger.error("Line: [" + e.getStackTrace()[2].getLineNumber() + "] " + e.getMessage());
        e.printStackTrace();
    }
    return null;
}
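The find-then-remove idiom above is the generic way to delete documents selected by a compound query when you want to inspect each one first. A reduced sketch under the same assumptions (a cookie collection keyed by profileId; names illustrative):

BasicDBObject dbQuery = new BasicDBObject("profileId", userId)
        .append("apiKey", new BasicDBObject("$exists", false)); // skip API-key sessions
DBCursor dbc = cookieColl.find(dbQuery);
while (dbc.hasNext()) {
    cookieColl.remove(dbc.next()); // remove each matched session document
}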
From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License:Open Source License
/**
 * Moves the output of a job from output_tmp to output and deletes
 * the tmp collection.
 *
 * @param cmr
 * @throws IOException
 * @throws ParserConfigurationException
 * @throws SAXException
 */
private void moveTempOutput(CustomMapReduceJobPojo cmr)
        throws IOException, SAXException, ParserConfigurationException {
    // If we are an export job then move files:
    bringTempOutputToFront(cmr);
    // (the rest of this will just do nothing)

    /**
     * Atomic plan:
     * If not append, move customlookup pointer to tmp collection, drop old collection.
     * If append, set sync flag (find/mod), move results from tmp to old, unset sync flag.
     */
    //step1 build out any of the post proc arguments
    DBObject postProcObject = null;
    boolean limitAllData = true;
    boolean hasSort = false;
    int limit = 0;
    BasicDBObject sort = new BasicDBObject();
    try {
        postProcObject = (DBObject) com.mongodb.util.JSON
                .parse(getQueryOrProcessing(cmr.query, QuerySpec.POSTPROC));
        if (postProcObject != null) {
            if (postProcObject.containsField("limitAllData")) {
                limitAllData = (Boolean) postProcObject.get("limitAllData");
            }
            if (postProcObject.containsField("limit")) {
                limit = (Integer) postProcObject.get("limit");
                if (postProcObject.containsField("sortField")) {
                    String sfield = (String) postProcObject.get("sortField");
                    int sortDir = 1;
                    if (postProcObject.containsField("sortDirection")) {
                        sortDir = (Integer) postProcObject.get("sortDirection");
                    }
                    sort.put(sfield, sortDir);
                    hasSort = true;
                } else if (limit > 0) {
                    // set a default sort because the user posted a limit
                    sort.put("_id", -1);
                    hasSort = true;
                }
            }
        }
    } catch (Exception ex) {
        _logger.info("job_error_post_proc_title=" + cmr.jobtitle + " job_error_post_proc_id="
                + cmr._id.toString() + " job_error_post_proc_message="
                + HarvestExceptionUtils.createExceptionMessage(ex));
    }

    //step 2a if not appending results then work on temp collection and swap to main
    if ((null == cmr.appendResults) || !cmr.appendResults) {
        // format temp then change lookup pointer to temp collection
        // transform all the results into necessary format:
        DBCursor dbc_tmp = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                .find(new BasicDBObject("key", null)).sort(sort).limit(limit);
        while (dbc_tmp.hasNext()) {
            DBObject dbo = dbc_tmp.next();
            Object key = dbo.get("_id");
            dbo.put("key", key);
            dbo.removeField("_id");
            DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp).insert(dbo);
        }
        DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                .remove(new BasicDBObject("key", null));

        // swap the output collections
        BasicDBObject notappendupdates = new BasicDBObject(CustomMapReduceJobPojo.outputCollection_,
                cmr.outputCollectionTemp);
        notappendupdates.append(CustomMapReduceJobPojo.outputCollectionTemp_, cmr.outputCollection);
        DbManager.getCustom().getLookup().findAndModify(
                new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                new BasicDBObject(MongoDbManager.set_, notappendupdates));
        String temp = cmr.outputCollectionTemp;
        cmr.outputCollectionTemp = cmr.outputCollection;
        cmr.outputCollection = temp;
    } else { //step 2b if appending results then drop modified results in output collection
        DbManager.getCustom().getLookup().findAndModify(
                new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                new BasicDBObject(MongoDbManager.set_, new BasicDBObject("isUpdatingOutput", true)));
        // remove any aged out results
        if ((null != cmr.appendAgeOutInDays) && cmr.appendAgeOutInDays > 0) {
            // remove any results that have aged out
            long ageOutMS = (long) (cmr.appendAgeOutInDays * MS_IN_DAY);
            Date lastAgeOut = new Date(((new Date()).getTime() - ageOutMS));
            DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection).remove(new BasicDBObject(
                    "_id", new BasicDBObject(MongoDbManager.lt_, new ObjectId(lastAgeOut))));
        }
        DBCursor dbc_tmp;
        if (!limitAllData) {
            // sort and limit the temp data set because we only want to process it
            dbc_tmp = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                    .find(new BasicDBObject("key", null)).sort(sort).limit(limit);
            limit = 0; // reset limit so we get everything in a few steps (we only want to limit the new data)
        } else {
            dbc_tmp = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp)
                    .find(new BasicDBObject("key", null));
        }
        DBCollection dbc = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        // transform temp results and dump into output collection
        while (dbc_tmp.hasNext()) {
            DBObject dbo = dbc_tmp.next();
            // transform the dbo to format {_id:ObjectId, key:(prev_id), value:value}
            Object key = dbo.get("_id");
            dbo.put("key", key);
            dbo.removeField("_id"); // _id field should be automatically set to objectid when inserting now
            dbc.insert(dbo);
        }
        // if there is a sort, we need to apply it to all the data now
        if (hasSort) {
            ObjectId OID = new ObjectId();
            BasicDBObject query = new BasicDBObject("_id", new BasicDBObject(MongoDbManager.lt_, OID));
            // find everything inserted before now and sort/limit the data
            DBCursor dbc_sort = dbc.find(query).sort(sort).limit(limit);
            while (dbc_sort.hasNext()) {
                // reinsert the data into db (it should be in sorted order naturally now)
                DBObject dbo = dbc_sort.next();
                dbo.removeField("_id");
                dbc.insert(dbo);
            }
            // remove everything inserted before we reorganized everything
            // (should leave only the new results in natural order)
            dbc.remove(query);
        }
        DbManager.getCustom().getLookup().findAndModify(
                new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                new BasicDBObject(MongoDbManager.set_, new BasicDBObject("isUpdatingOutput", false)));
    }
    //step3 clean up temp output collection so we can use it again
    // (drop it, removing chunks)
    try {
        DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp).drop();
    } catch (Exception e) {
        // That's fine, it probably just doesn't exist yet...
    }
}
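The recurring call in this job is find() chained with sort() and limit(). One detail the code relies on: in the legacy driver, limit(0) is a no-op, so an unset limit can be passed straight through. A minimal sketch with illustrative names:

BasicDBObject sort = new BasicDBObject("_id", -1); // default sort: newest ObjectIds first
DBCursor dbc_tmp = outputColl.find(new BasicDBObject("key", null))
        .sort(sort)
        .limit(limit); // limit == 0 means "no limit", matching the unset-limit case above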
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java
License:Open Source License
/**
 * getGeoReference
 * @param geoDb
 * @param query
 * @param nMaxReturns
 * @return
 */
private static DBCursor getGeoReference(DBCollection geoDb, BasicDBObject query, int nMaxReturns) {
    if (nMaxReturns == -1) {
        return geoDb.find(query);
    } else {
        return geoDb.find(query).limit(nMaxReturns);
    }
}
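Since limit(0) already means "no limit" in the legacy driver, the branch above could arguably be collapsed; a sketch of the equivalent one-liner:

return geoDb.find(query).limit(nMaxReturns == -1 ? 0 : nMaxReturns);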
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java
License:Open Source License
/**
 * getNearestCities
 * Get n-cities near a lat/lon pair, results returned ordered by distance from
 * the lat/lon pair
 * @param lat
 * @param lon
 * @param nMaxReturns
 * @return List<GeoFeaturePojo>
 */
public static List<GeoFeaturePojo> getNearestCities(DBCollection geoDb, String lat, String lon,
        int nMaxReturns) {
    try {
        // Create Double[] from lat, lon
        Double[] d = new Double[] { Double.parseDouble(lat), Double.parseDouble(lon) };
        // Build query object to return the shell equivalent of:
        // db.georeference.find({geoindex : {$near : [lat, lon]}})
        BasicDBObject query = new BasicDBObject();
        BasicDBObject near = new BasicDBObject();
        near.append("$near", d);
        query.put("geoindex", near);
        // Perform query
        DBCursor result = geoDb.find(query).limit(nMaxReturns);
        // Convert results to List<GeoFeaturePojo>
        List<GeoFeaturePojo> gpl = GeoFeaturePojo.listFromDb(result,
                new TypeToken<ArrayList<GeoFeaturePojo>>() {
                });
        return gpl;
    } catch (Exception e) {
        return null;
    }
}
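$near requires a geospatial (2d) index on the queried field. A reduced sketch of the same query, with an illustrative one-time index creation step and example coordinates (note that 2d indexes conventionally store [longitude, latitude], though this source stores [lat, lon]):

geoDb.createIndex(new BasicDBObject("geoindex", "2d")); // one-time setup (illustrative)
Double[] point = new Double[] { 38.89, -77.03 }; // example coordinates
BasicDBObject query = new BasicDBObject("geoindex", new BasicDBObject("$near", point));
DBCursor nearest = geoDb.find(query).limit(10); // results come back ordered by distance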
From source file:com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java
License:Open Source License
@Override
public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) {
    if (_isDirectory) {
        if (_isShare) { // must be a zip file
            ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>();
            @SuppressWarnings("unchecked")
            Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries();
            while (entries.hasMoreElements()) {
                net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName());
                zipFiles.add(newFile);
            }
            return zipFiles.toArray(new InfiniteFile[zipFiles.size()]);
        } //TESTED (3.2)
        else if (_isCustom) {
            // create some virtual directories eg at most 10K per "virtual directory"
            String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_);
            String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_);
            if (null == outputDatabase) {
                outputDatabase = "custommr";
            }
            DBCollection outColl = null;
            DBCursor dbc = null;
            if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory
                DBCollection chunks = MongoDbManager.getCollection("config", "chunks");
                StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection);
                dbc = chunks.find(new BasicDBObject("ns", ns.toString()));
                int splits = dbc.count();
                if (splits < 2) { // Nothing to do (unsharded or 1 chunk)
                    dbc.close();
                    outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                    dbc = outColl.find();
                } //TESTED (4.2)
                else { // Create one virtual dir per split
                    InfiniteFile[] virtualDirs = new InfiniteFile[splits];
                    int added = 0;
                    for (DBObject splitObj : dbc) {
                        BasicDBObject minObj = (BasicDBObject) splitObj.get("min");
                        BasicDBObject maxObj = (BasicDBObject) splitObj.get("max");
                        ObjectId minId = null;
                        try {
                            minId = (ObjectId) minObj.get("_id");
                        } catch (Exception e) {
                        } // min key..
                        ObjectId maxId = null;
                        try {
                            maxId = (ObjectId) maxObj.get("_id");
                        } catch (Exception e) {
                        } // max key..
                        // Handle current case where custom jobs are all dumped in with the wrong _id type
                        if ((null != minId) || (null != maxId)) {
                            if ((null != maxId) && (null != optionalFilterDate)) {
                                // (also used on the files below)
                                if (maxId.getTime() < optionalFilterDate.getTime()) {
                                    // (the "getTime()"s can overlap across chunks so we have to use minId
                                    // and accept that we'll often deserialize 1+ extra chunk every harvest)
                                    continue;
                                }
                            } //TESTED (by hand)
                            InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId);
                            virtualDirs[added] = split;
                            added++;
                        } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand)
                    }
                    dbc.close();
                    return virtualDirs;
                } //TESTED (5.2.2, 6.2.2)
            } //TESTED
            else { // Virtual directory
                BasicDBObject query = new BasicDBObject();
                if (null != _virtualDirStartLimit) {
                    if (null != optionalFilterDate) {
                        ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                        // (zero out the inc/machine ids so this query is independent to calling service)
                        if (altStartId.compareTo(_virtualDirStartLimit) > 0) {
                            // (altStartId > _virtualDirStartLimit)
                            query.put(MongoDbManager.gte_, altStartId);
                        } else {
                            query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                        }
                    } //TESTED (by hand)
                    else { // normal case
                        query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                    }
                } else if (null != optionalFilterDate) {
                    // (first chunk so always overwrite with optionalFilter date if applicable)
                    ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                    query.put(MongoDbManager.gte_, altStartId);
                } //TESTED (by hand)
                if (null != _virtualDirEndLimit) {
                    query.put(MongoDbManager.lt_, _virtualDirEndLimit);
                }
                outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle);
            } //TESTED (6.2.2) (doc skipping by hand)
            if (null != outColl) { // has files, create the actual file objects
                //DEBUG
                //System.out.println("CHUNK: GOT " + dbc.count());
                int docCount = dbc.count();
                if (docCount > 1 + maxDocsPerCycle) {
                    docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway)
                }
                InfiniteFile[] docs = new InfiniteFile[docCount];
                int added = 0;
                for (DBObject docObj : dbc) {
                    // (if didn't use a query then apply internal filter date by hand)
                    if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)
                            && (null != optionalFilterDate)) {
                        ObjectId docId = (ObjectId) docObj.get("_id");
                        if (optionalFilterDate.getTime() > docId.getTime()) {
                            continue;
                        }
                    } //TESTED
                    if (added >= maxDocsPerCycle) {
                        // (we've reached our limit so put the remaining docs in a new directory,
                        //  will only be used if it has to)
                        docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"),
                                _virtualDirEndLimit);
                        break;
                    } else {
                        InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj);
                        docs[added] = doc;
                    } //TESTED (both cases)
                    added++;
                }
                dbc.close();
                return docs;
            } //TESTED (4.2)
        }
    } else { // can just return myself
        InfiniteFile[] retVal = new InfiniteFile[1];
        retVal[0] = this;
        return retVal;
    } //TESTED (1.2, 2.2)
    return null;
}
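The "virtual directory" branch above exploits the fact that ObjectIds embed a creation timestamp, so a synthetic ObjectId can act as a date cut-off in an _id range query. A reduced sketch of that pattern (outColl and maxDocsPerCycle as in the example; the cut-off is illustrative):

Date cutOff = new Date(System.currentTimeMillis() - 24L * 3600L * 1000L); // e.g. 24h ago
ObjectId startId = new ObjectId((int) (cutOff.getTime() / 1000L), 0, 0); // zeroed machine/inc fields
BasicDBObject range = new BasicDBObject("$gte", startId).append("$lt", new ObjectId()); // up to "now"
DBCursor dbc = outColl.find(new BasicDBObject("_id", range)).limit(1 + maxDocsPerCycle);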
From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

    // Initialize the ES (create the index if it doesn't already exist):
    // 1. Set-up the entity feature index
    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex("association_index");
    //elasticManager.deleteMe();

    // Create the index if necessary
    String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
            AssociationFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");
    elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping,
            localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex("association_index");
    }

    // Now query the DB:
    DBCursor dbc = null;
    dbc = eventFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();
    int nSynced = 0;

    // Loop over array and invoke the cleansing function for each one
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

        // If this table has just been rebuilt from the document then the indexes are all wrong ...
        // recalculate and save
        if ('#' == evt.getIndex().charAt(0)) {
            AssociationPojo singleEvt = new AssociationPojo();
            singleEvt.setEntity1_index(evt.getEntity1_index());
            singleEvt.setEntity2_index(evt.getEntity2_index());
            singleEvt.setVerb_category(evt.getVerb_category());
            singleEvt.setGeo_index(evt.getGeo_index());
            evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt));
            eventFeatureDB.update(new BasicDBObject("_id", dbo.get("_id")),
                    new BasicDBObject(MongoDbManager.set_,
                            new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                    false, true);
            // (has to be a multi-update even though it's unique because it's sharded on index)
        }

        // Handle groups (system group is: "4c927585d591d31d7b37097a")
        if (null == evt.getCommunityId()) {
            evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
        }
        // Bulk add prep
        events.add(evt);
        nSynced++;

        if (events.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events,
                    AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null,
                    true);
            events.clear();
        }
    }
    // End loop over entities

    // write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
            new AssociationFeaturePojoIndexMap()), "_id", null, true);

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}
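The chunk handling here shows two less common DBCursor features: addSpecial() with $min/$max to clamp the scan to one shard chunk, and batchSize() to control round-trips. A reduced sketch, with chunkMin/chunkMax standing in for the chunk's min/max key documents:

DBCursor dbc = featureDB.find(query)
        .addSpecial("$min", chunkMin)   // inclusive lower bound on the index used
        .addSpecial("$max", chunkMax)   // exclusive upper bound
        .skip(nSkip).limit(nLimit)
        .batchSize(1000);               // fetch documents from the server 1000 at a time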
From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java
License:Apache License
private void doDelete(BasicDBObject query, int nLimit) {
    try {
        // Initialize the DB:
        DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();
        DBCursor cur = eventFeatureDB.find(query).limit(nLimit);
        // (this internally works in batches of 1000; just get _id)
        System.out.println("Found " + cur.count() + " records to delete");
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }
        ArrayList<AssociationFeaturePojo> events = new ArrayList<AssociationFeaturePojo>();
        LinkedList<String> eventIds = new LinkedList<String>();
        while (cur.hasNext()) {
            AssociationFeaturePojo event = AssociationFeaturePojo.fromDb(cur.next(),
                    AssociationFeaturePojo.class);
            events.add(event);
            eventIds.add(new StringBuffer(event.getIndex()).append(":").append(event.getCommunityId())
                    .toString());
            eventFeatureDB.remove(new BasicDBObject("index", event.getIndex()));
        }
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("association_index");
        elasticManager.bulkDeleteDocuments(eventIds);
    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (MongoException e) {
        e.printStackTrace();
    }
}
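The inline comment above ("just get _id") hints at a projection the code never actually applies. For reference, the two-argument find(query, fields) overload does this; a hypothetical sketch restricted to the fields this loop reads:

BasicDBObject fields = new BasicDBObject("index", 1).append("communityId", 1); // _id is returned by default
DBCursor cur = eventFeatureDB.find(query, fields).limit(nLimit); // fetch only the projected fields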
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
@SuppressWarnings("unused") private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort, BasicDBObject query, int nLimit) { ElasticSearchManager elasticManager = null; try {/*from w ww .j a v a 2 s. co m*/ // Initialize the DB: DBCollection feedsDB = DbManager.getDocument().getMetadata(); DBCollection contentDB = DbManager.getDocument().getContent(); DBCollection sourcesDB = DbManager.getIngest().getSource(); String indexName = "document_index"; // Test/debug recreate the index if (true) { // (delete the index) System.out.println("Deleting index..."); elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort); elasticManager.deleteMe(); //(also deletes the child index - same index, different type) // Create the index if necessary String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class); Builder localSettings = ImmutableSettings.settingsBuilder(); localSettings.put("number_of_shards", 10).put("number_of_replicas", 2); System.out.println("Creating index..." + sMapping); elasticManager = ElasticSearchManager.createIndex(indexName, null, false, sElasticHost + ":" + sElasticPort, sMapping, localSettings); } // Get the index (necessary if already created) if (null == elasticManager) { elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort); } // Get the feeds from the DB: //Debug: // System.out.println("Querying DB..."); DBCursor dbc = feedsDB.find(query).limit(nLimit); byte[] storageArray = new byte[200000]; while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class); //Debug: System.out.println("Getting content..." 
+ doc.getTitle() + " / " + doc.getUrl()); // Get the content: BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey()))); BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ); if (null != dboContent) { byte[] compressedData = ((byte[]) dboContent.get("gzip_content")); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = gzip.read(storageArray, 0, 200000); String s = new String(storageArray, 0, nRead, "UTF-8"); doc.setFullText(s); } // Get tag: SourcePojo src = _sourceCache.get(doc.getSourceKey()); if (null == src) { BasicDBObject srcDbo = (BasicDBObject) sourcesDB .findOne(new BasicDBObject("key", doc.getSourceKey())); if (null != srcDbo) { src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class); _sourceCache.put(doc.getSourceKey(), src); } } if (null != src) { Set<String> tagsTidied = new TreeSet<String>(); for (String s : src.getTags()) { String ss = s.trim().toLowerCase(); tagsTidied.add(ss); } doc.setTags(tagsTidied); } //TEST: set dynamic field // Lots of testing of dynamic dates: // feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString())); // String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated()); // feed.addToMetadata("another_dateISO", s1); // String s1_5 = new SimpleDateFormat().format(feed.getCreated()); // feed.addToMetadata("another_dateTimeJava", s1_5); // String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated()); // feed.addToMetadata("another_dateYYYYMMDD", s2); // String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated()); // feed.addToMetadata("another_dateRFC822", s3); // feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString()); // // Testing of the string field types // feed.addToMetadata("my_comment", "Testing this ABCDEFG"); // feed.addToMetadata("my_term", "Testing this UVWXYZ"); // feed.addToMetadata("my_text", "Testing this 123456"); // // Test an array of longs: // Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L; // feed.addToMetadata("md_long", tl); //TEST: some dummy event timestamp adding code (not seeing much/any in the data) // if (null != feed.getEvents()) { // int i = 0; // for (EventPojo evt: feed.getEvents()) { // //1: Add single date // if (0 == i) { // evt.time_start = "2011-01-01"; // } // //2: Add short span // if (1 == i) { // evt.time_start = "2010-04-06"; // evt.time_end = "2010-08-09"; // } // //3: Add cross-yr span // if (2 == i) { // evt.time_start = "2012-06-05"; // evt.time_end = "2013-09-05"; // } // //4: Add too long span // if (3 == i) { // evt.time_start = "2012-04-06"; // evt.time_end = "2014-04-09"; // } // i++; // } // } // For event adding, see data_model.test.TestCode } } catch (IOException e) { e.printStackTrace(); } finally { //nothing to do } }
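One detail worth isolating from this test: the content lookup uses findOne() with an $in list containing null, which matches documents where the field is either missing/null or equal to the given value. A minimal sketch with illustrative field names:

BasicDBObject contentQ = new BasicDBObject("url", url);
contentQ.put("sourceKey", new BasicDBObject("$in", Arrays.asList(null, sourceKey)));
BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ); // first match or null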
From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

    // Initialize the ES (create the index if it doesn't already exist):
    // 1. Set-up the entity feature index
    String indexName = "entity_index";
    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex(indexName);
    //elasticManager.deleteMe();

    // Create the index if necessary
    String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
            EntityFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");
    elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex(indexName);
    }

    // Now query the DB:
    DBCursor dbc = null;
    dbc = entityFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    int nSynced = 0;
    List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
    while (dbc.hasNext()) {
        EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

        if (null != feature.getAlias()) { // (some corrupt gazetteer entry)
            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            // if there is no community id, add system group (something is wrong if this happens?)
            if (null == feature.getCommunityId()) {
                feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
            }
        }
        entities.add(feature);
        nSynced++;

        // Add the entities
        if (entities.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                    new EntityFeaturePojoIndexMap()), "_id", null, true);
            // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)
            entities = new ArrayList<EntityFeaturePojo>();
        }
    }
    // write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
            new EntityFeaturePojoIndexMap()), "_id", null, true);
    // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}
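A note on the "dbc.count() - nSkip" arithmetic above: in the legacy driver, DBCursor.count() ignores skip() and limit() (it counts everything the query matches), while size() applies them, hence the manual subtraction. As a sketch:

int totalMatching = dbc.count(); // matches for the query, ignoring skip/limit
int thisBatch = dbc.size();      // matches with skip/limit applied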