List of usage examples for com.mongodb.BasicDBObject.get
Method signature: public Object get(final String key)
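BasicDBObject.get returns the raw value stored under the key as an Object (or null when the key is absent), so callers typically null-check or type-check the result before casting, as the examples below do. The following minimal sketch is illustrative only (it is not taken from any of the projects listed below) and shows the basic guard-then-cast pattern:

import com.mongodb.BasicDBObject;
import org.bson.types.ObjectId;

public class BasicDBObjectGetExample {
    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject("title", "example");
        doc.put("owner", new ObjectId());

        // get() returns the stored value as Object, or null if the key is absent
        Object owner = doc.get("owner");
        if (owner instanceof ObjectId) {            // guard before casting
            ObjectId ownerId = (ObjectId) owner;
            System.out.println("owner id: " + ownerId);
        }
        System.out.println(doc.get("missing_key")); // prints "null"
    }
}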
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable, String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer, String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue, String arguments, Boolean incrementalMode, ObjectId userId, Boolean selfMerge, String originalOutputCollection, Boolean appendResults) throws IOException { String dbserver = prop_general.getDatabaseServer(); output = outputDatabase + "." + tempOutputCollection; boolean isAdmin = AuthUtils.isAdmin(userId); int nSplits = 8; int nDocsPerSplit = 12500; //add communities to query if this is not a custom table BasicDBObject oldQueryObj = null; BasicDBObject srcTags = null;// w ww . ja va 2s .co m // Start with the old query: if (query.startsWith("{")) { oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query); } else { oldQueryObj = new BasicDBObject(); } boolean elasticsearchQuery = oldQueryObj.containsField("qt") && !isCustomTable; int nLimit = 0; if (oldQueryObj.containsField("$limit")) { nLimit = oldQueryObj.getInt("$limit"); oldQueryObj.remove("$limit"); } if (oldQueryObj.containsField("$splits")) { nSplits = oldQueryObj.getInt("$splits"); oldQueryObj.remove("$splits"); } if (oldQueryObj.containsField("$srctags")) { srcTags = new BasicDBObject(SourcePojo.tags_, oldQueryObj.get("$srctags")); oldQueryObj.remove("$srctags"); } if (bLocalMode) { // If in local mode, then set this to a large number so we always run inside our limit/split version // (since for some reason MongoInputFormat seems to fail on large collections) nSplits = InfiniteMongoSplitter.MAX_SPLITS; } if (oldQueryObj.containsField("$docsPerSplit")) { nDocsPerSplit = oldQueryObj.getInt("$docsPerSplit"); oldQueryObj.remove("$docsPerSplit"); } oldQueryObj.remove("$fields"); oldQueryObj.remove("$output"); oldQueryObj.remove("$reducers"); String mapperKeyClass = oldQueryObj.getString("$mapper_key_class", ""); String mapperValueClass = oldQueryObj.getString("$mapper_value_class", ""); oldQueryObj.remove("$mapper_key_class"); oldQueryObj.remove("$mapper_value_class"); String cacheList = null; Object cacheObj = oldQueryObj.get("$caches"); if (null != cacheObj) { cacheList = cacheObj.toString(); // (either array of strings, or single string) if (!cacheList.startsWith("[")) { cacheList = "[" + cacheList + "]"; // ("must" now be valid array) } oldQueryObj.remove("$caches"); } //TESTED if (null != nDebugLimit) { // (debug mode override) nLimit = nDebugLimit; } boolean tmpIncMode = (null != incrementalMode) && incrementalMode; Date fromOverride = null; Date toOverride = null; Object fromOverrideObj = oldQueryObj.remove("$tmin"); Object toOverrideObj = oldQueryObj.remove("$tmax"); if (null != fromOverrideObj) { fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true); } if (null != toOverrideObj) { toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false); } if (!isCustomTable) { if (elasticsearchQuery) { oldQueryObj.put("communityIds", communityIds); //tmin/tmax not supported - already have that capability as part of the query } else { if (input.equals("feature.temporal")) { if ((null != fromOverride) || (null != toOverride)) { oldQueryObj.put("value.maxTime", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, true)); } //TESTED oldQueryObj.put("_id.c", new BasicDBObject(DbManager.in_, communityIds)); } else { oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, 
communityIds)); if ((null != fromOverride) || (null != toOverride)) { oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false)); } //TESTED if (input.equals("doc_metadata.metadata")) { oldQueryObj.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (ensures not soft-deleted) } } } } else { if ((null != fromOverride) || (null != toOverride)) { oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false)); } //TESTED //get the custom table (and database) input = CustomOutputManager.getCustomDbAndCollection(input); } query = oldQueryObj.toString(); if (arguments == null) arguments = ""; // Generic configuration out.write("<?xml version=\"1.0\"?>\n<configuration>"); // Mongo specific configuration out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title + "</value></property>" + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>" + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>" + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://" + dbserver + "/" + input + "</value></property>" + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://" + dbserver + "/" + output + "</value> </property>" + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>" + StringEscapeUtils.escapeXml(query) + "</value></property>" + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>" + ((fields == null) ? ("") : fields) + "</value></property>" + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>" + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>" + nLimit + "</value><!-- 0 == no limit --></property>" + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? 
--><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>" + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper + "</value></property>" + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer + "</value></property>" + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>" + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat</value></property>" + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>" + outputKey + "</value></property>" + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>" + outputValue + "</value></property>" + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value>" + mapperKeyClass + "</value></property>" + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value>" + mapperValueClass + "</value></property>" + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>" + combiner + "</value></property>" + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>" + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>" + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>"); // Infinit.e specific configuration out.write("\n\t<property><!-- User Arguments [optional] --><name>infinit.e.userid</name><value>" + StringEscapeUtils.escapeXml(userId.toString()) + "</value></property>" + "\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>" + StringEscapeUtils.escapeXml(arguments) + "</value></property>" + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>" + nSplits + "</value></property>" + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>" + nDocsPerSplit + "</value></property>" + "\n\t<property><!-- Infinit.e incremental mode [optional] --><name>update.incremental</name><value>" + tmpIncMode + "</value></property>" + "\n\t<property><!-- Infinit.e quick admin check [optional] --><name>infinit.e.is.admin</name><value>" + isAdmin + "</value></property>" + "\n\t<property><!-- Infinit.e userid [optional] --><name>infinit.e.userid</name><value>" + userId + "</value></property>"); if (null != cacheList) { out.write( "\n\t<property><!-- Infinit.e cache list [optional] --><name>infinit.e.cache.list</name><value>" + cacheList + "</value></property>"); } //TESTED if (null != srcTags) { out.write( "\n\t<property><!-- Infinit.e src tags filter [optional] --><name>infinit.e.source.tags.filter</name><value>" + srcTags.toString() + "</value></property>"); } if (null != selfMerge && selfMerge && originalOutputCollection != null) { originalOutputCollection = "mongodb://" + dbserver + "/" + outputDatabase + "." 
+ originalOutputCollection; out.write( "\n\t<property><!-- This jobs output collection for passing into the mapper along with input collection [optional] --><name>infinit.e.selfMerge</name><value>" + originalOutputCollection + "</value></property>"); } // Closing thoughts: out.write("\n</configuration>"); out.flush(); out.close(); }
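A recurring pattern in the example above is parsing a JSON query string into a BasicDBObject, then lifting job-control fields such as $limit or $srctags out of it with containsField/get/remove so that only the real MongoDB query remains. Below is a reduced, hypothetical sketch of that pattern; the field names and query string are invented for illustration and SourcePojo.tags_ is replaced by a literal key:

import com.mongodb.BasicDBObject;

public class QueryControlFieldExample {
    public static void main(String[] args) {
        String query = "{ \"$limit\": 100, \"$srctags\": \"news\", \"sourceKey\": \"feeds.example\" }";
        BasicDBObject queryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query);

        int limit = 0;
        if (queryObj.containsField("$limit")) {
            limit = queryObj.getInt("$limit");
            queryObj.remove("$limit");              // strip the control field from the query
        }
        BasicDBObject srcTags = null;
        if (queryObj.containsField("$srctags")) {
            srcTags = new BasicDBObject("tags", queryObj.get("$srctags"));
            queryObj.remove("$srctags");
        }
        // queryObj now contains only the real MongoDB query
        System.out.println("limit=" + limit + ", srcTags=" + srcTags + ", query=" + queryObj);
    }
}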
From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java
License:Open Source License
public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields) throws SAXException, IOException, ParserConfigurationException { BasicDBList dbl = new BasicDBList(); PropertiesManager props = new PropertiesManager(); Configuration conf = getConfiguration(props); Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false); @SuppressWarnings({ "unchecked", "rawtypes" }) SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable( pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf); // Very basic, only allow top level, 1 level of nesting, and field removal HashSet<String> fieldLookup = null; if (null != fields) { fieldLookup = new HashSet<String>(); String[] fieldArray = fields.split(","); for (String field : fieldArray) { String[] fieldDecomp = field.split(":"); fieldLookup.add(fieldDecomp[0]); }//from ww w . ja v a2 s .c o m } //TOTEST int nRecords = 0; for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) { BasicDBObject element = new BasicDBObject(); // KEY Writable key = record.getFirst(); if (key instanceof org.apache.hadoop.io.Text) { org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key; element.put("key", writable.toString()); } else if (key instanceof org.apache.hadoop.io.DoubleWritable) { org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key; element.put("key", Double.toString(writable.get())); } else if (key instanceof org.apache.hadoop.io.IntWritable) { org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key; element.put("key", Integer.toString(writable.get())); } else if (key instanceof org.apache.hadoop.io.LongWritable) { org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key; element.put("key", Long.toString(writable.get())); } else if (key instanceof BSONWritable) { element.put("key", MongoDbUtil.convert((BSONWritable) key)); } // VALUE Writable value = record.getSecond(); if (value instanceof org.apache.hadoop.io.Text) { org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value; element.put("value", writable.toString()); } else if (value instanceof org.apache.hadoop.io.DoubleWritable) { org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value; element.put("value", Double.toString(writable.get())); } else if (value instanceof org.apache.hadoop.io.IntWritable) { org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value; element.put("value", Integer.toString(writable.get())); } else if (value instanceof org.apache.hadoop.io.LongWritable) { org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value; element.put("value", Long.toString(writable.get())); } else if (value instanceof BSONWritable) { element.put("value", MongoDbUtil.convert((BSONWritable) value)); } else if (value instanceof org.apache.mahout.math.VectorWritable) { Vector vec = ((org.apache.mahout.math.VectorWritable) value).get(); BasicDBList dbl2 = listFromMahoutVector(vec, "value", element); element.put("value", dbl2); } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) { org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value; element.put("valueWeight", vecW.getWeight()); BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element); element.put("value", dbl2); } else if 
(value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) { Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue(); BasicDBObject clusterVal = new BasicDBObject(); clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal)); clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal)); element.put("value", clusterVal); } else { element.put("unknownValue", value.getClass().toString()); } // Check the fields settings: // Only handle a few... if (null != fieldLookup) { for (String fieldToRemove : fieldLookup) { if (fieldToRemove.startsWith("value.")) { fieldToRemove = fieldToRemove.substring(6); BasicDBObject nested = (BasicDBObject) element.get("value."); if (null != nested) { nested.remove(fieldToRemove); } } else { element.remove(fieldToRemove); } } //TOTEST } dbl.add(element); nRecords++; if ((nLimit > 0) && (nRecords >= nLimit)) { break; } } return dbl; }
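The field-filtering logic above reaches into a nested sub-document by casting the result of get before removing keys from it. A small illustrative sketch of that nested get/remove pattern follows; the key names are hypothetical:

import com.mongodb.BasicDBObject;

public class NestedGetExample {
    public static void main(String[] args) {
        BasicDBObject value = new BasicDBObject("score", 0.9).append("debugInfo", "scratch");
        BasicDBObject element = new BasicDBObject("key", "doc1").append("value", value);

        // get() returns Object, so cast to BasicDBObject to work with the nested document
        BasicDBObject nested = (BasicDBObject) element.get("value");
        if (nested != null) {
            nested.remove("debugInfo");  // drop an unwanted sub-field in place
        }
        System.out.println(element);     // "value" now contains only "score"
    }
}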
From source file:com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchHadoopUtils.java
License:Apache License
public static void handleElasticsearchInput(CustomMapReduceJobPojo job, Configuration config, BasicDBObject advancedConfigurationDbo) { // Pull out type list: Object o = advancedConfigurationDbo.remove("$types"); String[] types = null;//from w ww .j av a 2s .co m if (null != o) { if (o instanceof BasicDBList) { types = ((BasicDBList) o).toArray(new String[0]); } else if (o instanceof String) { types = ((String) o).split("\\s*,\\s*"); } } //TESTED (by hand) //QUERY: // Date override: Date fromOverride = null; Date toOverride = null; Object fromOverrideObj = advancedConfigurationDbo.remove("$tmin"); Object toOverrideObj = advancedConfigurationDbo.remove("$tmax"); if (null != fromOverrideObj) { fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true); } if (null != toOverrideObj) { toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false); } Boolean streaming = null; Object streamingObj = advancedConfigurationDbo.remove("$streaming"); if (streamingObj instanceof Boolean) { streaming = (Boolean) streamingObj; } //DEBUG //System.out.println("QUERY = " + advancedConfigurationDbo.toString()); BasicDBObject newQuery = new BasicDBObject(); Object queryObj = advancedConfigurationDbo.get("query"); if (queryObj instanceof String) { config.set("es.query", queryObj.toString()); // URL version) if ((null != fromOverride) || (null != toOverride)) { throw new RuntimeException( "Can't specify $tmin/$tmax shortcut in conjunction with 'URL' query type"); } //TESTED } else if (null != queryObj) { newQuery.put("query", queryObj); Object filterObj = advancedConfigurationDbo.get("filter"); if (null != filterObj) newQuery.put("filter", filterObj); // (doesn't matter if it doesn't exist) Object fieldsObj = advancedConfigurationDbo.get("fields"); if (null != fieldsObj) newQuery.put("fields", fieldsObj); // (doesn't matter if it doesn't exist) Object sizeObj = advancedConfigurationDbo.get("size"); if (null != sizeObj) newQuery.put("size", sizeObj); // (doesn't matter if it doesn't exist) if ((null != fromOverride) || (null != toOverride)) { if (null == filterObj) { BasicDBObject filterRangeParamsDbo = new BasicDBObject(); if (null != fromOverride) { filterRangeParamsDbo.put("gte", fromOverride.getTime()); } if (null != toOverride) { filterRangeParamsDbo.put("lte", toOverride.getTime()); } BasicDBObject filterRangeDbo = new BasicDBObject("@timestamp", filterRangeParamsDbo); BasicDBObject filterDbo = new BasicDBObject("range", filterRangeDbo); newQuery.put("filter", filterDbo); } else { // combine filter throw new RuntimeException( "Can't (currently) specify $tmin/$tmax shortcut in conjunction with filter"); } //TESTED } config.set("es.query", newQuery.toString()); } //(else no query == match all) //COMMUNITIES Pattern dateRegex = null; ThreadSafeSimpleDateFormat tssdf = null; if ((null != fromOverride) || (null != toOverride)) { dateRegex = Pattern.compile("[0-9]{4}[.][0-9]{2}[.][0-9]{2}"); tssdf = new ThreadSafeSimpleDateFormat("yyyy.MM.dd"); } //TESTED StringBuffer overallIndexNames = new StringBuffer(); for (ObjectId commId : job.communityIds) { StringBuffer indexNames = new StringBuffer(); //TODO (INF-2641): need to handle: //c) anyway to sub-query?! (look for communityIds term?!) 
if (null == streaming) { indexNames.append("recs_*").append(commId.toString()).append("*"); } else if (streaming) { indexNames.append("recs_t_").append(commId.toString()).append("*"); } else {// !streaming indexNames.append("recs_").append(commId.toString()); } //TESTED StringBuffer decomposedIndexes = new StringBuffer(); boolean needDecomposedIndexes = false; HashSet<String> typesAdded = new HashSet<String>(); if ((null != types) && (null == fromOverride) && (null == toOverride)) { // (types manual, no date filtering - can be much simpler) for (String s : types) typesAdded.add(s); } else { // (All this oddly written code is to minimize the number of es types that get exposed, because // they are really badly behaved in terms of bw compatbility) if (null != types) { for (String s : types) typesAdded.add(s); } ElasticSearchManager indexMgr = ElasticSearchManager.getIndex("doc_dummy"); // (index guaranteed to exist) Object[] indexMetaObj = indexMgr.getRawClient().admin().cluster().prepareState() .setIndices(indexNames.toString()).setRoutingTable(false).setNodes(false) .setListenerThreaded(false).get().getState().getMetaData().getIndices().values().toArray(); if (null != indexMetaObj) for (Object oo : indexMetaObj) { IndexMetaData indexMeta = (IndexMetaData) oo; String indexName = indexMeta.getIndex(); if ((null != fromOverride) || (null != toOverride)) { //DEBUG //System.out.println("INDEX: " + indexName); Matcher m = dateRegex.matcher(indexName); if (m.find()) { try { Date d = tssdf.parse(m.group()); long endpoint = d.getTime() + 24L * 3600L * 1000L - 1; //DEBUG //System.out.println("***************** COMPARE: " + d + " FROM " + fromOverride + " TO " + toOverride + "..errr . " + m.group()); if (null != fromOverride) { if (endpoint < fromOverride.getTime()) { // no overlap on the left needDecomposedIndexes = true; continue; } } //TESTED if (null != toOverride) { if (d.getTime() > toOverride.getTime()) { // no overlap on the right needDecomposedIndexes = true; continue; } } //TESTED } catch (ParseException e) { // just carry on, odd index name, it happens needDecomposedIndexes = true; continue; } } } //TESTED (end loop over time checking) if (null == types) { Iterator<String> typesIt = indexMeta.getMappings().keysIt(); while (typesIt.hasNext()) { String type = typesIt.next(); if (!type.equals("_default_")) { typesAdded.add(type); } } } if (0 != decomposedIndexes.length()) { decomposedIndexes.append(','); } decomposedIndexes.append(indexName); } //(end loop over indexes) } //(end if need to derive the types from the indexes) if (needDecomposedIndexes) { // (because we filtered some indexes out) indexNames = decomposedIndexes; } if (0 == indexNames.length()) { continue; // nothing to do here... 
} int numTypesAdded = 0; if (typesAdded.isEmpty()) { // there doesn't seem to be any types associated with this set of indexes continue; // (ie don't add) } else for (String type : typesAdded) { if (numTypesAdded > 0) { indexNames.append(","); } else { indexNames.append("/"); } numTypesAdded++; indexNames.append(type); } if (overallIndexNames.length() > 0) { overallIndexNames.append(",,"); } overallIndexNames.append(indexNames); } //(end loop over community) //TESTED (by hand) if (0 == overallIndexNames.length()) { throw new RuntimeException( "Communities contained no types, either all indexes empty, or index is corrupt"); } //TESTED (by hand) //DEBUG //System.out.println("INDEXES = " + overallIndexNames.toString()); config.set("es.resource", overallIndexNames.toString()); config.set("es.index.read.missing.as.empty", "yes"); //proxy if running in debug mode: if (InfiniteEsInputFormat.LOCAL_DEBUG_MODE) { config.set("es.net.proxy.http.host", "localhost"); config.set("es.net.proxy.http.port", "8888"); } //TESTED (by hand) }

From source file:com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager.java
License:Open Source License
public static void updateDocEntitiesFromDeletedDocuments(String uuid) {
    String outCollection = new StringBuilder(uuid).append("_AggregationUtils").toString();
    try {
        PropertiesManager props = new PropertiesManager();
        if (props.getAggregationDisabled()) { // (no need to do this)
            return;
        }
        DBCollection outColl = DbManager.getDB("doc_metadata").getCollection(outCollection);
        DBCursor dbc = outColl.find();
        for (DBObject dbo : dbc) {
            BasicDBObject entityEl = (BasicDBObject) dbo;
            BasicDBObject entityVal = (BasicDBObject) entityEl.get("value");
            long nDocDecrement = entityVal.getLong("dc");
            long nFreqDecrement = entityVal.getLong("f");
            long nCurrFreq = entityVal.getLong("tf");
            long nCurrDocCount = entityVal.getLong("tdc");
            // (These are by construction the lowest values so this will provide some defence against going -ve)
            if (nDocDecrement > nCurrDocCount) {
                nDocDecrement = nCurrDocCount;
            }
            if (nFreqDecrement > nCurrFreq) {
                nFreqDecrement = nCurrFreq;
            }
            BasicDBObject entityId = (BasicDBObject) entityEl.get("_id");
            ObjectId commId = null;
            Object commObj = entityId.get("comm");
            if (commObj instanceof ObjectId) {
                commId = entityId.getObjectId("comm");
            }
            String index = (String) entityId.get("index");
            if ((null == index) || (null == commId)) {
                continue; // random error
            }
            BasicDBObject updateQuery = new BasicDBObject(EntityFeaturePojo.index_, index);
            updateQuery.put(EntityFeaturePojo.communityId_, commId);
            BasicDBObject entityUpdate1 = new BasicDBObject(EntityFeaturePojo.doccount_, -nDocDecrement);
            entityUpdate1.put(EntityFeaturePojo.totalfreq_, -nFreqDecrement);
            BasicDBObject entityUpdate = new BasicDBObject(DbManager.inc_, entityUpdate1);
            if (_diagnosticMode) {
                if (_logInDiagnosticMode)
                    System.out.println("UPDATE FEATURE DATABASE: " + updateQuery.toString() + "/" + entityUpdate.toString());
            } else {
                DbManager.getFeature().getEntity().update(updateQuery, entityUpdate);
                // (can be a single query because the query is on index, the shard)
            } //TESTED
            if ((nDocDecrement < nCurrDocCount) && (nDocDecrement * 10 > nCurrDocCount)) {
                // ie there are some documents left
                // and the doc count has shifted by more than 10%
                BasicDBObject updateQuery2 = new BasicDBObject(EntityPojo.docQuery_index_, index);
                updateQuery2.put(DocumentPojo.communityId_, commId);
                BasicDBObject entityUpdate2_1 = new BasicDBObject(EntityPojo.docUpdate_doccount_, nCurrDocCount - nDocDecrement);
                entityUpdate2_1.put(EntityPojo.docUpdate_totalfrequency_, nCurrFreq - nFreqDecrement);
                BasicDBObject entityUpdate2 = new BasicDBObject(DbManager.set_, entityUpdate2_1);
                if (_diagnosticMode) {
                    if (_logInDiagnosticMode)
                        System.out.println("UPDATE DOC DATABASE: " + updateQuery2.toString() + "/" + entityUpdate2.toString());
                } else {
                    DbManager.getDocument().getMetadata().update(updateQuery2, entityUpdate2, false, true);
                }
            }
        } //TESTED (including when to update logic above)
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Tidy up
    DbManager.getDB("doc_metadata").getCollection(outCollection).drop();
}
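The clean-up routine above reads map-reduce style output rows: it casts get("value") and get("_id") to nested BasicDBObjects, then mixes typed accessors such as getLong and getObjectId with raw get calls that are type-checked first. A compact, hypothetical sketch of the same access pattern (field names copied from the example, values invented):

import com.mongodb.BasicDBObject;
import org.bson.types.ObjectId;

public class MapReduceOutputReadExample {
    public static void main(String[] args) {
        // Shape of a typical map-reduce output row: { _id: {...}, value: {...} }
        BasicDBObject id = new BasicDBObject("index", "entity1").append("comm", new ObjectId());
        BasicDBObject value = new BasicDBObject("dc", 3L).append("f", 7L);
        BasicDBObject row = new BasicDBObject("_id", id).append("value", value);

        BasicDBObject rowId = (BasicDBObject) row.get("_id");
        BasicDBObject rowVal = (BasicDBObject) row.get("value");

        long docCount = rowVal.getLong("dc");      // typed accessor, no cast needed
        Object comm = rowId.get("comm");           // raw get when the type must be checked
        ObjectId commId = (comm instanceof ObjectId) ? rowId.getObjectId("comm") : null;
        String index = (String) rowId.get("index");

        System.out.println(index + " / " + commId + " / dc=" + docCount);
    }
}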
From source file:com.ikanow.infinit.e.processing.generic.GenericProcessingController.java
License:Open Source License
public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature, boolean bRebuildDocsIndex) { try { //create elasticsearch indexes if (!ElasticSearchManager.pingIndex(null, null)) { throw new RuntimeException("Index is red, disable indexing operations"); } //TESTED PropertiesManager pm = new PropertiesManager(); if (!pm.getAggregationDisabled()) { boolean languageNormalization = pm.getNormalizeEncoding(); Builder localSettingsEvent = ImmutableSettings.settingsBuilder(); localSettingsEvent.put("number_of_shards", 10).put("number_of_replicas", 2); localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); if (languageNormalization) { localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer", "icu_folding", "standard", "lowercase"); } else { localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); }/*from w ww. jav a2s .co m*/ Builder localSettingsGaz = ImmutableSettings.settingsBuilder(); localSettingsGaz.put("number_of_shards", 10).put("number_of_replicas", 2); localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); if (languageNormalization) { localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer", "icu_folding", "standard", "lowercase"); } else { localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); } //event feature String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class); ElasticSearchManager eventIndex = IndexManager.createIndex( AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent); if (null == eventIndex) { // (if has been previously referenced in this process space) eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_); } eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_); if (bDeleteEventFeature) { eventIndex.deleteMe(); eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent); } //entity feature String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class); ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz); if (null == entityIndex) { // (if has been previously referenced in this process space) entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_); } entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_); if (bDeleteEntityFeature) { entityIndex.deleteMe(); entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz); } } //DOCS - much more complicated than anything else boolean bPingMainIndexFailed = !ElasticSearchManager .pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); // (ie if main doc index doesn't exist then always rebuild all indexes) if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing... 
try { Thread.sleep(60000); } catch (Exception e) { } bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); } bRebuildDocsIndex |= bPingMainIndexFailed; // check the main index has the "collection" alias - if not then rebuild everything if (!bPingMainIndexFailed && (null == _aliasInfo)) { ElasticSearchManager docIndex = ElasticSearchManager .getIndex(DocumentPojoIndexMap.globalDocumentIndex_); ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster() .state(new ClusterStateRequest()).actionGet(); _aliasInfo = CrossVersionImmutableMapOfImmutableMaps .getAliases(clusterState.getState().getMetaData()); if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) { bRebuildDocsIndex = true; } } //TESTED createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs); createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs); // Some hardwired dummy communities createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin) createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user) // (create dummy index used to keep personal group aliases) if (bRebuildDocsIndex || bDeleteDocs) { // OK, going to have different shards for different communities: // Get a list of all the communities: BasicDBObject query = new BasicDBObject(); BasicDBObject fieldsToDrop = new BasicDBObject("members", 0); fieldsToDrop.put("communityAttributes", 0); fieldsToDrop.put("userAttributes", 0); DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop); List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts) int i = 0; System.out.println("Initializing " + dbc.size() + " indexes:"); for (int j = 0; j < 2; ++j) { for (DBObject dbotmp : tmparray) { if ((++i % 100) == 0) { System.out.println("Initialized " + i + " indexes."); } BasicDBObject dbo = (BasicDBObject) dbotmp; // OK, going to see if there are any sources with this group id, create a new index if so: // (Don't use CommunityPojo data model here for performance reasons.... // (Also, haven't gotten round to porting CommunityPojo field access to using static fields)) ObjectId communityId = (ObjectId) dbo.get("_id"); boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false); boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false); ObjectId parentCommunityId = (ObjectId) dbo.get("parentId"); createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup, bSystemGroup, bDeleteDocs, j == 0); } //end loop over communities } // end loop over communities - first time parents only } // (end if need to do big loop over all sources) } catch (Exception e) { //DEBUG //e.printStackTrace(); throw new RuntimeException(e.getMessage()); } }
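When looping over the community documents above, the code combines a raw get-and-cast for ObjectId fields with getBoolean calls that supply a default for fields that may be missing. A minimal sketch of that defensive read pattern (field names taken from the example, values invented):

import com.mongodb.BasicDBObject;
import org.bson.types.ObjectId;

public class CommunityFieldReadExample {
    public static void main(String[] args) {
        BasicDBObject community = new BasicDBObject("_id", new ObjectId())
                .append("isPersonalCommunity", true);   // note: no "parentId" field

        ObjectId communityId = (ObjectId) community.get("_id");
        // getBoolean with a default is safe even when the field is absent
        boolean personal = community.getBoolean("isPersonalCommunity", false);
        boolean system = community.getBoolean("isSystemCommunity", false);
        // get() on a missing key simply returns null
        ObjectId parentId = (ObjectId) community.get("parentId");

        System.out.println(communityId + " personal=" + personal
                + " system=" + system + " parent=" + parentId);
    }
}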
From source file:com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager.java
License:Open Source License
/** * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion) * @param col//from w w w . j a va2 s. c o m * @param doc - needs url, sourceKey set * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance * * CALLED FROM: removeFromDatastore_byURL(col, List<doc>, bDeleteContent) <- ADDS INDEX, CREATED TO FIELDS * removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE] * MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID * processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey, * DON'T have _id, BUT do have updateId and index (correct except in many geo cases)] * pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX * updateHarvestStatus(...) */ private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields, boolean bDeleteContent) { // 1] Create the query to soft delete the document BasicDBObject query = new BasicDBObject(); query.put(DocumentPojo.url_, doc.getUrl()); query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm(doc.getSourceKey())); // 2] Delete the content if needed if (bDeleteContent) { if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) { if (!_diagnosticMode) { DbManager.getDocument().getContent().remove(query); } else { System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: " + doc.getSourceKey() + "/" + doc.getUrl()); } } } //TESTED // 3] Work out which fields we have and which (if any we need to go and fetch): boolean needToFindAndModify = false; if (null == doc.getId()) { // This is called from processDocuments if (null != doc.getUpdateId()) { // update case... 
doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good) // (doc.index is populated but may not be correct because of the "many geos" workaround): if (DocumentPojoIndexMap.hasManyGeos(doc)) { doc.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_); // (note this check isn't stateless, it actually populates "locs" at the same time // this is handled in addToDatastore (update case), temp removed when adding to DB } //TESTED (2.1.2, diagnostic mode, doc2) } else { // Not an update case, we're going to have to grab the document after all, which is a bit slower needToFindAndModify = true; } } //TESTED (2.1.2, diagnostic mode, doc2) if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc) doc.setCreated(new Date(doc.getUpdateId().getTime())); } //TESTED (2.1.2, diagnostic mode, doc2) } // (if we're here and index is not set, then it is intended to be null) // 4] Update the doc_metadata collection BasicDBObject softDelete = getSoftDeleteUpdate(); BasicDBObject deadDoc = null; // (not normally needed) if (needToFindAndModify) { // less pleasant, need to go grab the doc deadDoc = (BasicDBObject) col.findOne(query, fields); } //TESTED (2.1.2) if (!_diagnosticMode) { col.update(query, softDelete, false, true); // (needs to be multi- even though there's a single element for sharding reasons) } //TESTED (2.1.2) // 5] Add fields if necessary if (null != deadDoc) { doc.setCreated((Date) deadDoc.get(DocumentPojo.created_)); // (if getting this doc anyway then might as well get the created) doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_)); doc.setIndex((String) deadDoc.get(DocumentPojo.index_)); if (_diagnosticMode) { System.out .println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString()); } } //TESTED (2.1.2) else if (_diagnosticMode) { if (!needToFindAndModify) { System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted " + doc.toDb().toString()); } else { System.out.println( "StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString()); } } //TESTED (2.1.2) }
From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) { ElasticSearchManager elasticManager = null; // Initialize the DB: DBCollection eventFeatureDB = DbManager.getFeature().getAssociation(); // Initialize the ES (create the index if it doesn't already): // 1. Set-up the entity feature index ElasticSearchManager.setDefaultClusterName("infinite-aws"); // (delete the index) //elasticManager = ElasticSearchManager.getIndex("association_index"); //elasticManager.deleteMe(); // Create the index if necessary String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class); Builder localSettings = ImmutableSettings.settingsBuilder(); localSettings.put("number_of_shards", 1).put("number_of_replicas", 0); localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping, localSettings);//ww w .ja v a 2 s. c o m // Get the index (necessary if already created) if (null == elasticManager) { elasticManager = ElasticSearchManager.getIndex("association_index"); } // Now query the DB: DBCursor dbc = null; dbc = eventFeatureDB.find(query); if (null != chunk) { if (chunk.containsField(DbManager.min_)) { dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_)); } if (chunk.containsField(DbManager.max_)) { dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_)); } } dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000); if (null == chunk) { int nCount = dbc.count() - nSkip; if (nCount < 0) nCount = 0; System.out.println( "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit)); if (0 == nCount) { // Nothing to do... return; } } List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>(); int nSynced = 0; // Loop over array and invoke the cleansing function for each one while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class); // If this table has just been rebuilt from the document then the indexes are all wrong ... 
// recalculate and save if ('#' == evt.getIndex().charAt(0)) { AssociationPojo singleEvt = new AssociationPojo(); singleEvt.setEntity1_index(evt.getEntity1_index()); singleEvt.setEntity2_index(evt.getEntity2_index()); singleEvt.setVerb_category(evt.getVerb_category()); singleEvt.setGeo_index(evt.getGeo_index()); evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt)); eventFeatureDB .update(new BasicDBObject("_id", dbo.get("_id")), new BasicDBObject(MongoDbManager.set_, new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())), false, true); // (has to be a multi-update even though it's unique because it's sharded on index) } // Handle groups (system group is: "4c927585d591d31d7b37097a") if (null == evt.getCommunityId()) { evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a")); } // Bulk add prep events.add(evt); nSynced++; if (events.size() > 1000) { elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null, true); events.clear(); } } // End loop over entities //write whatevers left elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null, true); if (null != chunk) { System.out.println("Found " + nSynced + " records to sync in chunk"); } }
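The transfer utilities in this file and the ones below share a chunking idiom: if the caller passed a chunk descriptor, its bounds are read with get and applied to the cursor via addSpecial. Below is a hedged sketch of that idiom as a stand-alone helper method, assuming DbManager.min_/max_ resolve to the "$min"/"$max" query modifiers; the collection, query, and chunk are supplied by the caller and this is not the project's actual helper:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;

public class ChunkedCursorExample {
    /** Opens a cursor over one shard chunk, applying $min/$max bounds if the chunk supplies them. */
    public static DBCursor findChunk(DBCollection coll, BasicDBObject query, BasicDBObject chunk) {
        DBCursor cursor = coll.find(query);
        if (chunk != null) {
            if (chunk.containsField("$min")) {
                cursor = cursor.addSpecial("$min", chunk.get("$min"));
            }
            if (chunk.containsField("$max")) {
                cursor = cursor.addSpecial("$max", chunk.get("$max"));
            }
        }
        return cursor.batchSize(1000);
    }
}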
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk) throws IOException { PropertiesManager pm = new PropertiesManager(); int nMaxContentSize_bytes = pm.getMaxContentSize(); // Initialize the DB: DBCollection docsDB = DbManager.getDocument().getMetadata(); DBCollection contentDB = DbManager.getDocument().getContent(); DBCollection sourcesDB = DbManager.getIngest().getSource(); ElasticSearchManager.setDefaultClusterName("infinite-aws"); // 1. Get the documents from the DB (combining data + metadata and refreshing source meta) // (Ignore soft-deleted records:) if (null == query) { query = new BasicDBObject(); }/*from ww w . j a v a 2 s. c o m*/ Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_); if (null != sourceKeyQueryTerm) { if (query.toString() .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) { throw new RuntimeException( "Can't specify sourceKey as part of complex query term: " + query.toString()); } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }") if (sourceKeyQueryTerm instanceof String) { query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm)); } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }") else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1); fields.put(SourcePojo.highestDistributionFactorStored_, 1); DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields); LinkedList<String> sourceKeys = new LinkedList<String>(); for (DBObject dbo : dbc) { String key = (String) dbo.get(SourcePojo.key_); Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_); Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor); sourceKeys.addAll(sourceKeysForSource); } query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys)); } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }") else { throw new RuntimeException("Can't specify sourceKey as part of complex query term"); } //(actually not possible, just included here for mathematical completeness...) } else { if (query.toString() .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) { throw new RuntimeException("Can't specify sourceKey as part of complex query term"); } //TESTE (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }") // Optimize communityId into sourceKeys... 
if (null != query.get(DocumentPojo.communityId_)) { try { ObjectId commId = query.getObjectId(DocumentPojo.communityId_); BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1); fields.put(SourcePojo.highestDistributionFactorStored_, 1); DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields); LinkedList<String> sourceKeys = new LinkedList<String>(); int added = 0; for (DBObject dbo : dbc) { String key = (String) dbo.get(SourcePojo.key_); Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_); Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor); sourceKeys.addAll(sourceKeysForSource); added += sourceKeysForSource.size(); } query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys)); System.out.println("(Optimized simple community query to " + added + " source key(s))"); } catch (Exception e) { //DEBUG //e.printStackTrace(); System.out.println("(Can't optimize complex community query: " + e.getMessage()); } } //TESTED (by hand - including distributed source version) } // Ignored delete objects Object urlQuery = query.get(DocumentPojo.url_); if (null == urlQuery) { query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?) } //TESTED else if (urlQuery instanceof BasicDBObject) { ((BasicDBObject) urlQuery).append("$regex", "^[^?]"); } //TESTED //DEBUG //System.out.println("COMBINED QUERY= " + query.toString()); // If aggregating, kick off the background aggregation thread if (bAggregate) { EntityBackgroundAggregationManager.startThread(); AssociationBackgroundAggregationManager.startThread(); } //Debug: DBCursor dbc = null; dbc = docsDB.find(query); if (null != chunk) { if (chunk.containsField(DbManager.min_)) { dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_)); } if (chunk.containsField(DbManager.max_)) { dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_)); } } dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000); if (null == chunk) { int nCount = dbc.count() - nSkip; if (nCount < 0) nCount = 0; System.out.println( "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit)); if (0 == nCount) { // Nothing to do... return; } } byte[] storageArray = new byte[200000]; int nSynced = 0; LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>(); Map<ObjectId, LinkedList<DocumentPojo>> communityList = null; ObjectId currCommunityId = null; while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class); String sDocIndex = doc.getIndex(); if (null == sDocIndex) { sDocIndex = "document_index"; } if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) { _deletedIndex.add(sDocIndex); rebuildIndex(sDocIndex); try { // (Just in case the index requires some time to sort itself out) Thread.sleep(1000); } catch (InterruptedException e) { } } //Debug: //System.out.println("Getting content..." 
+ feed.getTitle() + " / " + feed.getUrl()); // Get the content: if ((0 != nMaxContentSize_bytes) && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) { BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey()))); BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1); fields.put(CompressedFullTextPojo.sourceKey_, 1); DBCursor dbcGzip = contentDB.find(contentQ, fields); while (dbcGzip.hasNext()) { BasicDBObject dboContent = (BasicDBObject) dbcGzip.next(); if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) { // If this has another version then ignore this one... if (dbc.hasNext()) { continue; } //TESTED (by hand) } byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_)); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = 0; StringBuffer output = new StringBuffer(); while (nRead >= 0) { nRead = gzip.read(storageArray, 0, 200000); if (nRead > 0) { String s = new String(storageArray, 0, nRead, "UTF-8"); output.append(s); } } doc.setFullText(output.toString()); } } // (else document has full text already) // Get tags, if necessary: // Always overwrite tags - one of the reasons we might choose to migrate // Also may need source in order to support source index filtering SourcePojo src = _sourceCache.get(doc.getSourceKey()); if (null == src) { //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?) BasicDBObject srcDbo = (BasicDBObject) sourcesDB .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey())); if (null != srcDbo) { src = SourcePojo.fromDb(srcDbo, SourcePojo.class); if (null != src.getProcessingPipeline()) { try { // Set the index settings HarvestController hc = new HarvestController(); HarvestControllerPipeline hcPipe = new HarvestControllerPipeline(); hcPipe.extractSource_preProcessingPipeline(src, hc); } catch (Exception e) { //DEBUG e.printStackTrace(); } } //TESTED (by hand) _sourceCache.put(doc.getSourceKey(), src); } } doc.setTempSource(src); // (needed for source index filtering) if (null != src) { if (null != src.getTags()) { Set<String> tagsTidied = new TreeSet<String>(); for (String s : src.getTags()) { String ss = s.trim().toLowerCase(); tagsTidied.add(ss); } // May also want to write this back to the DB: //TODO (INF-2223): Handle append tags or not in the pipeline... if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) { if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) { BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getRawSourceKey()); // (ie including the # if there is one) updateQuery.put(DocumentPojo._id_, doc.getId()); docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied)))); } doc.setTags(tagsTidied); // (just copy ptr across) } } } // 2. 
Update the index with the new document // (Optionally also update entity and assoc features) if (bAggregate) { if (null == currCommunityId) { currCommunityId = doc.getCommunityId(); } else if (!currCommunityId.equals(doc.getCommunityId())) { LinkedList<DocumentPojo> perCommunityDocList = null; if (null == communityList) { // (very first time we see > 1 community) communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>(); perCommunityDocList = new LinkedList<DocumentPojo>(); perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet) communityList.put(currCommunityId, perCommunityDocList); } currCommunityId = doc.getCommunityId(); perCommunityDocList = communityList.get(currCommunityId); if (null == perCommunityDocList) { perCommunityDocList = new LinkedList<DocumentPojo>(); communityList.put(currCommunityId, perCommunityDocList); } perCommunityDocList.add(doc); } } //TESTED nSynced++; docsToTransfer.add(doc); if (0 == (nSynced % 10000)) { StoreAndIndexManager manager = new StoreAndIndexManager(); if (bAggregate) { // Loop over communities and aggregate each one then store the modified entities/assocs doAggregation(communityList, docsToTransfer); communityList = null; // (in case the next 10,000 docs are all in the same community!) currCommunityId = null; } //TOTEST manager.addToSearch(docsToTransfer); docsToTransfer.clear(); System.out.println("(Synced " + nSynced + " records)"); } } // (End loop over docs) // Sync remaining docs if (!docsToTransfer.isEmpty()) { if (bAggregate) { // Loop over communities and aggregate each one then store the modified entities/assocs doAggregation(communityList, docsToTransfer); } StoreAndIndexManager manager = new StoreAndIndexManager(); manager.addToSearch(docsToTransfer); } if (null != chunk) { System.out.println("Found " + nSynced + " records to sync in chunk"); } if (bAggregate) { System.out.println("Completed. You can hit CTRL+C at any time."); System.out.println( "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities."); try { Thread.sleep(300000); } catch (InterruptedException e) { } // Turn off so we can exit EntityBackgroundAggregationManager.stopThreadAndWait(); AssociationBackgroundAggregationManager.stopThreadAndWait(); } }
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
@SuppressWarnings("unused") private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort, BasicDBObject query, int nLimit) { ElasticSearchManager elasticManager = null; try {/*from ww w . jav a2 s. co m*/ // Initialize the DB: DBCollection feedsDB = DbManager.getDocument().getMetadata(); DBCollection contentDB = DbManager.getDocument().getContent(); DBCollection sourcesDB = DbManager.getIngest().getSource(); String indexName = "document_index"; // Test/debug recreate the index if (true) { // (delete the index) System.out.println("Deleting index..."); elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort); elasticManager.deleteMe(); //(also deletes the child index - same index, different type) // Create the index if necessary String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class); Builder localSettings = ImmutableSettings.settingsBuilder(); localSettings.put("number_of_shards", 10).put("number_of_replicas", 2); System.out.println("Creating index..." + sMapping); elasticManager = ElasticSearchManager.createIndex(indexName, null, false, sElasticHost + ":" + sElasticPort, sMapping, localSettings); } // Get the index (necessary if already created) if (null == elasticManager) { elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort); } // Get the feeds from the DB: //Debug: // System.out.println("Querying DB..."); DBCursor dbc = feedsDB.find(query).limit(nLimit); byte[] storageArray = new byte[200000]; while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class); //Debug: System.out.println("Getting content..." 
+ doc.getTitle() + " / " + doc.getUrl()); // Get the content: BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey()))); BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ); if (null != dboContent) { byte[] compressedData = ((byte[]) dboContent.get("gzip_content")); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = gzip.read(storageArray, 0, 200000); String s = new String(storageArray, 0, nRead, "UTF-8"); doc.setFullText(s); } // Get tag: SourcePojo src = _sourceCache.get(doc.getSourceKey()); if (null == src) { BasicDBObject srcDbo = (BasicDBObject) sourcesDB .findOne(new BasicDBObject("key", doc.getSourceKey())); if (null != srcDbo) { src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class); _sourceCache.put(doc.getSourceKey(), src); } } if (null != src) { Set<String> tagsTidied = new TreeSet<String>(); for (String s : src.getTags()) { String ss = s.trim().toLowerCase(); tagsTidied.add(ss); } doc.setTags(tagsTidied); } //TEST: set dynamic field // Lots of testing of dynamic dates: // feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString())); // String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated()); // feed.addToMetadata("another_dateISO", s1); // String s1_5 = new SimpleDateFormat().format(feed.getCreated()); // feed.addToMetadata("another_dateTimeJava", s1_5); // String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated()); // feed.addToMetadata("another_dateYYYYMMDD", s2); // String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated()); // feed.addToMetadata("another_dateRFC822", s3); // feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString()); // // Testing of the string field types // feed.addToMetadata("my_comment", "Testing this ABCDEFG"); // feed.addToMetadata("my_term", "Testing this UVWXYZ"); // feed.addToMetadata("my_text", "Testing this 123456"); // // Test an array of longs: // Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L; // feed.addToMetadata("md_long", tl); //TEST: some dummy event timestamp adding code (not seeing much/any in the data) // if (null != feed.getEvents()) { // int i = 0; // for (EventPojo evt: feed.getEvents()) { // //1: Add single date // if (0 == i) { // evt.time_start = "2011-01-01"; // } // //2: Add short span // if (1 == i) { // evt.time_start = "2010-04-06"; // evt.time_end = "2010-08-09"; // } // //3: Add cross-yr span // if (2 == i) { // evt.time_start = "2012-06-05"; // evt.time_end = "2013-09-05"; // } // //4: Add too long span // if (3 == i) { // evt.time_start = "2012-04-06"; // evt.time_end = "2014-04-09"; // } // i++; // } // } // For event adding, see data_model.test.TestCode } } catch (IOException e) { e.printStackTrace(); } finally { //nothing to do } }
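Both document transfer examples recover full text by casting get("gzip_content") to byte[] and decompressing it. The sketch below shows that decode step in isolation; the stored content document is faked in memory just to keep the example self-contained and runnable:

import com.mongodb.BasicDBObject;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class GzipContentExample {
    public static void main(String[] args) throws IOException {
        // Fake a stored content document with a gzipped full-text field
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (GZIPOutputStream gz = new GZIPOutputStream(bos)) {
            gz.write("full text of the document".getBytes(StandardCharsets.UTF_8));
        }
        BasicDBObject dboContent = new BasicDBObject("url", "http://example.com")
                .append("gzip_content", bos.toByteArray());

        // get() returns Object; binary BSON fields come back as byte[]
        byte[] compressed = (byte[]) dboContent.get("gzip_content");
        StringBuilder output = new StringBuilder();
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed))) {
            byte[] buf = new byte[8192];
            int nRead;
            while ((nRead = gzip.read(buf)) > 0) {
                output.append(new String(buf, 0, nRead, StandardCharsets.UTF_8));
            }
        }
        System.out.println(output);
    }
}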
From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) { ElasticSearchManager elasticManager = null; // Initialize the DB: DBCollection entityFeatureDB = DbManager.getFeature().getEntity(); // Initialize the ES (create the index if it doesn't already): // 1. Set-up the entity feature index String indexName = "entity_index"; ElasticSearchManager.setDefaultClusterName("infinite-aws"); // (delete the index) //elasticManager = ElasticSearchManager.getIndex(indexName); //elasticManager.deleteMe(); // Create the index if necessary String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class); Builder localSettings = ImmutableSettings.settingsBuilder(); localSettings.put("number_of_shards", 1).put("number_of_replicas", 0); localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings); // Get the index (necessary if already created) if (null == elasticManager) { elasticManager = ElasticSearchManager.getIndex(indexName); }//from w w w. j av a2 s. co m // Now query the DB: DBCursor dbc = null; dbc = entityFeatureDB.find(query); if (null != chunk) { if (chunk.containsField(DbManager.min_)) { dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_)); } if (chunk.containsField(DbManager.max_)) { dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_)); } } dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000); if (null == chunk) { int nCount = dbc.count() - nSkip; if (nCount < 0) nCount = 0; System.out.println( "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit)); if (0 == nCount) { // Nothing to do... return; } } int nSynced = 0; List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>(); while (dbc.hasNext()) { EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class); if (null != feature.getAlias()) { // (some corrupt gazateer entry) // Handle groups (system group is: "4c927585d591d31d7b37097a") // if there is no community id, add system group (something is wrong if this happens?) if (null == feature.getCommunityId()) { feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a")); } } entities.add(feature); nSynced++; // Add the entities if (entities.size() > 1000) { elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()), "_id", null, true); // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community) entities = new ArrayList<EntityFeaturePojo>(); } } //write whatevers left elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()), "_id", null, true); // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community) if (null != chunk) { System.out.println("Found " + nSynced + " records to sync in chunk"); } }