List of usage examples for com.mongodb BasicDBObject containsField
public boolean containsField(final String field)
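Before the project examples below, a minimal standalone sketch of what containsField reports (field names here are illustrative only, not taken from any of the projects):

import com.mongodb.BasicDBObject;

public class ContainsFieldDemo {
    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject("title", "example")
                .append("count", 42)
                .append("optional", null);

        System.out.println(doc.containsField("title"));    // true
        System.out.println(doc.containsField("count"));    // true
        System.out.println(doc.containsField("optional")); // true: the key exists even though its value is null
        System.out.println(doc.containsField("missing"));  // false
    }
}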
From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License:Open Source License
private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable, String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer, String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue, String arguments) throws IOException { String dbserver = prop_general.getDatabaseServer(); output = outputDatabase + "." + tempOutputCollection; int nSplits = 8; int nDocsPerSplit = 12500; //add communities to query if this is not a custom table if (!isCustomTable) { // Start with the old query: BasicDBObject oldQueryObj = null; if (query.startsWith("{")) { oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query); } else {/*from w w w . java2 s . c o m*/ oldQueryObj = new BasicDBObject(); } // Community Ids aren't indexed in the metadata collection, but source keys are, so we need to transform to that BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds)); boolean bAdminOverride = false; if (oldQueryObj.containsField("admin")) { // For testing only... if (1 == communityIds.size()) { ObjectId communityId = communityIds.get(0); if (RESTTools.adminLookup(communityId.toString())) { bAdminOverride = true; if (oldQueryObj.containsField("max.splits")) { nSplits = oldQueryObj.getInt("max.splits"); } if (oldQueryObj.containsField("max.docs.per.split")) { nDocsPerSplit = oldQueryObj.getInt("max.docs.per.split"); } } } } //(end diagnostic/benchmarking/test code for admins only part 1) if (bAdminOverride) { oldQueryObj = (BasicDBObject) oldQueryObj.get("admin"); //(end diagnostic/benchmarking/test code for admins only part 2) } else if (oldQueryObj.containsField(DocumentPojo.sourceKey_) || input.startsWith("feature.")) { // Source Key specified by user, stick communityIds check in for security oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds)); } else { // Source key not specified by user, transform communities->sourcekeys BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1); DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields); if (dbc.count() > 500) { // (too many source keys let's keep the query size sensible...) oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds)); } else { HashSet<String> sourceKeys = new HashSet<String>(); while (dbc.hasNext()) { DBObject dbo = dbc.next(); String sourceKey = (String) dbo.get(SourcePojo.key_); if (null != sourceKey) { sourceKeys.add(sourceKey); } } if (sourceKeys.isEmpty()) { // query returns empty throw new RuntimeException("Communities contain no sources"); } BasicDBObject newQueryClauseObj = new BasicDBObject(DbManager.in_, sourceKeys); // Now combine the queries... oldQueryObj.put(DocumentPojo.sourceKey_, newQueryClauseObj); } // (end if too many source keys across the communities) } //(end if need to break source keys down into communities) query = oldQueryObj.toString(); } else { //get the custom table (and database) input = getCustomDbAndCollection(input); } if (arguments == null) arguments = ""; // Generic configuration out.write("<?xml version=\"1.0\"?>\n<configuration>"); // Mongo specific configuration out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title + "</value></property>" + "\n\t<property><!-- run the job verbosely ? 
--><name>mongo.job.verbose</name><value>true</value></property>" + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>" + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://" + dbserver + "/" + input + "</value></property>" + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://" + dbserver + "/" + output + "</value> </property>" + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>" + query + "</value></property>" + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>" + ((fields == null) ? ("") : fields) + "</value></property>" + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>" + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>0</value><!-- 0 == no limit --></property>" + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? --><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>" + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper + "</value></property>" + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer + "</value></property>" + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>" + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.mongodb.hadoop.MongoOutputFormat</value></property>" + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>" + outputKey + "</value></property>" + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>" + outputValue + "</value></property>" + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value></value></property>" + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value></value></property>" + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>" + combiner + "</value></property>" + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>" + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>" + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>"); // Infinit.e specific configuration out.write("\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>" + StringEscapeUtils.escapeXml(arguments) + "</value></property>" + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>" + nSplits + "</value></property>" + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>" + nDocsPerSplit + "</value></property>"); // Closing thoughts: out.write("\n</configuration>"); out.flush(); out.close(); }
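The pattern in the example above of probing a user-supplied query object for optional override fields before falling back to defaults can be reduced to a short sketch. This is a simplified standalone version, not the project's API; the defaults mirror the values in the example:

import com.mongodb.BasicDBObject;

public class SplitOverrides {
    static int[] readSplitOverrides(String queryJson) {
        BasicDBObject queryObj = queryJson.startsWith("{")
                ? (BasicDBObject) com.mongodb.util.JSON.parse(queryJson)
                : new BasicDBObject();
        int nSplits = 8;           // default, as in the example above
        int nDocsPerSplit = 12500; // default, as in the example above
        if (queryObj.containsField("max.splits")) {
            nSplits = queryObj.getInt("max.splits");
        }
        if (queryObj.containsField("max.docs.per.split")) {
            nDocsPerSplit = queryObj.getInt("max.docs.per.split");
        }
        return new int[] { nSplits, nDocsPerSplit };
    }
}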
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked") public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery, boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount) { alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS); // (in standalone mode, never use chunks) MongoURI uri = conf.getInputURI();//from w w w . j a v a 2s.com DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri); if (conf.getLimit() > 0) { return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll); } else { if (!alwaysUseChunks) { int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits(); int count = 0; if (null == splitDocCount) { if (nMaxCount <= 1) { nMaxCount = 0; } else { //DEBUG //System.out.println(coll.find(confQuery).limit(1).explain()); count = (int) coll.getCount(confQuery, null, nMaxCount, 0); if (0 == count) { return new ArrayList<InputSplit>(); } } //TESTED } else { count = splitDocCount; } //if maxdocssplit and maxsplits is set and there are less documents than splits*docspersplit then use the new splitter //otherwise use the old splitter if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) { _logger.debug("Calculating splits manually"); int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1; return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll); } //TESTED } if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) { // OK if we're going to do the sharded version then we will want to calculate splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if returns true) } //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case) if (!newShardScheme) { // unlike new sharding scheme, in this case the query is fixed, so overwrite now: conf.setQuery(confQuery); } List<InputSplit> splits = MongoSplitter.calculateSplits(conf); // (unless manually set, like above, runs with the _original_ query) int initialSplitSize = splits.size(); // We have the MongoDB-calculated splits, now calculate their intersection vs the query @SuppressWarnings("rawtypes") Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>(); @SuppressWarnings("rawtypes") Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>(); BasicDBObject originalQuery = confQuery; ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size()); Iterator<InputSplit> splitIt = splits.iterator(); while (splitIt.hasNext()) { try { orderedArraySet_afterMin.clear(); MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next(); BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min"); BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max"); //DEBUG //_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max); //System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max); if (null != min) { // How does the min fit in with the general query try { if (compareFields(-1, originalQuery, min, max, orderedArraySet, orderedArraySet_afterMin) < 0) { splitIt.remove(); continue; } } catch (Exception e) { } // do nothing probably just some comparable issue } //TESTED if (null != max) { // How does the min fit in with the general query try { if (compareFields(1, originalQuery, max, min, orderedArraySet, orderedArraySet_afterMin) > 0) { splitIt.remove(); continue; } } 
catch (Exception e) { } // do nothing probably just some comparable issue } //TESTED //DEBUG //_logger.info("(retained split)"); //System.out.println("(retained split)"); // (don't worry about edge cases, won't happen very often and will just result in a spurious empty mapper) //////////////////////////////// // Now some infinit.e specific processing... if (newShardScheme) { @SuppressWarnings("rawtypes") TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_); if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) { @SuppressWarnings("rawtypes") Comparable minSourceKey = null; Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_); if (minSourceKeyObj instanceof String) { minSourceKey = (String) minSourceKeyObj; } if (null == minSourceKey) { minSourceKey = sourceKeyOrderedArray.first(); } //TESTED @SuppressWarnings("rawtypes") Comparable maxSourceKey = null; Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_); if (maxSourceKeyObj instanceof String) { maxSourceKey = (String) maxSourceKeyObj; } if (null == maxSourceKey) { maxSourceKey = sourceKeyOrderedArray.last(); } //TESTED DBObject splitQuery = mongoSplit.getQuerySpec(); BasicDBObject splitQueryQuery = new BasicDBObject( (BasicBSONObject) splitQuery.get("$query")); if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKEy splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey); } //TESTED (array of sources, only one matches) else { // multiple matching source keys splitQueryQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true))); } //TESTED (array of sources, multiple match) newsplits.add( new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout())); } else { // original query is of sufficient simplicity newsplits.add( new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout())); } //TESTED (no change to existing source) } //TESTED else { // old sharding scheme, remove min/max and replace with normal _id based query where possible DBObject splitQuery = mongoSplit.getQuerySpec(); // Step 1: create a query range for _id: BasicDBObject idRange = null; Object idMin = (min == null) ? null : min.get(DocumentPojo._id_); Object idMax = (max == null) ? 
null : max.get(DocumentPojo._id_); if (!(idMin instanceof ObjectId)) idMin = null; if (!(idMax instanceof ObjectId)) idMax = null; if ((null != idMin) || (null != idMax)) { idRange = new BasicDBObject(); if (null != idMin) { idRange.put(DbManager.gte_, idMin); } if (null != idMax) { idRange.put(DbManager.lt_, idMax); } } //TESTED // Step 2: merge with whatever we have at the moment: if (null != idRange) { BasicDBObject splitQueryQuery = new BasicDBObject( (BasicBSONObject) splitQuery.get("$query")); Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_); boolean convertedAwayFromMinMax = false; if (null == idQueryElement) { // nice and easy, add _id range splitQueryQuery.put(DocumentPojo._id_, idRange); convertedAwayFromMinMax = true; } //TESTED else if (!splitQueryQuery.containsField(DbManager.and_)) { // OK we're going to just going to make life easy splitQueryQuery.remove(DocumentPojo._id_); splitQueryQuery.put(DbManager.and_, Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement), new BasicDBObject(DocumentPojo._id_, idRange))); convertedAwayFromMinMax = true; } //TESTED // (else stick with min/max) if (convertedAwayFromMinMax) { // can construct an _id query splitQuery.removeField("$min"); splitQuery.removeField("$max"); } //TESTED splitQuery.put("$query", splitQueryQuery); } newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout())); } //TESTED } catch (Exception e) { //DEBUG //e.printStackTrace(); } // do nothing must be some other type of input split } //TESTED //DEBUG //System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size()); _logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to " + newsplits.size()); return newsplits; } }
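One containsField use worth isolating from the splitter above is the "only add a constraint if the caller has not already supplied one" guard. A minimal sketch under that assumption; the field name and key list below are placeholders, not the project's constants:

import java.util.Arrays;
import java.util.List;
import com.mongodb.BasicDBObject;

public class QueryGuard {
    static void constrainIfAbsent(BasicDBObject query, String field, List<String> allowedKeys) {
        // Leave any user-supplied constraint on the field untouched;
        // otherwise restrict it to the allowed set.
        if (!query.containsField(field)) {
            query.put(field, new BasicDBObject("$in", allowedKeys));
        }
    }

    public static void main(String[] args) {
        BasicDBObject query = new BasicDBObject("title", "example");
        constrainIfAbsent(query, "sourceKey", Arrays.asList("src.a", "src.b"));
        System.out.println(query); // prints the query with the added $in clause
    }
}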
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
private static boolean isQueryNonTrivial(BasicDBObject query) {
    if ((query.size() > 3) || ((query.size() > 2) && !query.containsField(DocumentPojo.sourceKey_))) {
        return true;
    }
    return false;
}
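A hedged sketch of how a check like this might be used; the caller and the literal "sourceKey" string are assumptions for illustration, not taken from the splitter:

import com.mongodb.BasicDBObject;

public class SplitPlanner {
    // Same shape as the helper above, with the constant spelled out as an assumption.
    static boolean isQueryNonTrivial(BasicDBObject query) {
        return (query.size() > 3)
                || ((query.size() > 2) && !query.containsField("sourceKey"));
    }

    static String chooseStrategy(BasicDBObject query) {
        // Hypothetical caller: pick a splitting strategy based on query complexity.
        return isQueryNonTrivial(query) ? "count-per-split" : "chunk-boundaries";
    }
}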
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation) throws IOException, SAXException, ParserConfigurationException { StringWriter xml = new StringWriter(); String outputCollection = job.outputCollectionTemp;// (non-append mode) if ((null != job.appendResults) && job.appendResults) outputCollection = job.outputCollection; // (append mode, write directly in....) else if (null != job.incrementalMode) job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode) createConfigXML(xml, job.jobtitle, job.inputCollection, InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper, job.reducer, job.combiner,/*w w w . ja va 2 s.c om*/ InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode, job.submitterID, job.selfMerge, job.outputCollection, job.appendResults); ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, savedClassLoader); Thread.currentThread().setContextClassLoader(child); // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable boolean dataModelLoaded = true; try { URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, null); try { Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest); } catch (ClassNotFoundException e2) { //(this is fine, will use the cached version) dataModelLoaded = false; } if (dataModelLoaded) Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest); } catch (ClassNotFoundException e1) { throw new RuntimeException( "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards"); } // Now load the XML into a configuration object: Configuration config = new Configuration(); // Add the client configuration overrides: if (!bLocalMode) { String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/"; config.addResource(new Path(hadoopConfigPath + "core-site.xml")); config.addResource(new Path(hadoopConfigPath + "mapred-site.xml")); config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml")); } //TESTED try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes())); NodeList nList = doc.getElementsByTagName("property"); for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String name = getTagValue("name", eElement); String value = getTagValue("value", eElement); if ((null != name) && (null != value)) { config.set(name, value); } } } } catch (Exception e) { throw new IOException(e.getMessage()); } // Some other config defaults: // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config) config.set("mapred.map.tasks.speculative.execution", "false"); config.set("mapred.reduce.tasks.speculative.execution", "false"); // (default 
security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera) // Now run the JAR file try { BasicDBObject advancedConfigurationDbo = null; try { advancedConfigurationDbo = (null != job.query) ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query)) : (new BasicDBObject()); } catch (Exception e) { advancedConfigurationDbo = new BasicDBObject(); } boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable; if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) { throw new RuntimeException( "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead."); } config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing) if (bLocalMode) { // local job tracker and FS mode config.set("mapred.job.tracker", "local"); config.set("fs.default.name", "local"); } else { if (bTestMode) { // run job tracker locally but FS mode remotely config.set("mapred.job.tracker", "local"); } else { // normal job tracker String trackerUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker"); config.set("mapred.job.tracker", trackerUrl); } String fsUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name"); config.set("fs.default.name", fsUrl); } if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.data_model.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.processing.custom.library.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } //TESTED // Debug scripts (only if they exist), and only in non local/test mode if (!bLocalMode && !bTestMode) { try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_map_error_handler.sh", config); config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle); config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_reduce_error_handler.sh", config); config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on } //TODO (???): TOTEST // (need to do these 2 things here before the job is created, at which point the config class has been copied across) //1) Class<?> mapperClazz = Class.forName(job.mapper, true, child); if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz .newInstance(); preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode)); } //TESTED //2) if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) 
{ // Need to download the GridFSZip file try { Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/", "GridFSZipFile.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } catch (Throwable t) { } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!) } if (job.inputCollection.equals("records")) { InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo); //(won't run under 0.19 so running with "records" should cause all sorts of exceptions) } //TESTED (by hand) if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec"); } // Manually specified caches List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"), job, config, props_custom); Job hj = new Job(config); // (NOTE: from here, changes to config are ignored) try { if (null != localJarCaches) { if (bLocalMode || bTestMode) { Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class }); method.setAccessible(true); method.invoke(child, localJarCaches.toArray()); } //TOTEST (tested logically) } Class<?> classToLoad = Class.forName(job.mapper, true, child); hj.setJarByClass(classToLoad); if (job.inputCollection.equalsIgnoreCase("filesystem")) { String inputPath = null; try { inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); if (!inputPath.endsWith("/")) { inputPath = inputPath + "/"; } } catch (Exception e) { } if (null == inputPath) { throw new RuntimeException("Must specify 'file.url' if reading from filesystem."); } inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath); InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive) InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB) InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child)); } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) { String[] oidStrs = null; try { String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)"); Matcher m = oidExtractor.matcher(inputPath); if (m.find()) { oidStrs = m.group(1).split("\\s*,\\s*"); } else { throw new RuntimeException( "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath); } InfiniteHadoopUtils.authenticateShareList(job, oidStrs); } catch (Exception e) { throw new RuntimeException( "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e); } hj.getConfiguration().setStrings("mapred.input.dir", oidStrs); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child)); } else if (job.inputCollection.equals("records")) { hj.setInputFormatClass((Class<? extends InputFormat>) Class .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child)); } else { if (esMode) { hj.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName( "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat", true, child)); } else { hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child)); } } if ((null != job.exportToHdfs) && job.exportToHdfs) { //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?) Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom); if ((null != job.outputKey) && (null != job.outputValue) && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text") && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) { // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text) hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child)); TextOutputFormat.setOutputPath(hj, outPath); } //TESTED else { hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child)); SequenceFileOutputFormat.setOutputPath(hj, outPath); } //TESTED } else { // normal case, stays in MongoDB hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child)); } hj.setMapperClass((Class<? extends Mapper>) mapperClazz); String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null); if (null != mapperOutputKeyOverride) { hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride)); } //TESTED String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null); if (null != mapperOutputValueOverride) { hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride)); } //TESTED if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null") && !job.reducer.equalsIgnoreCase("none")) { hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child)); // Variable reducers: if (null != job.query) { try { hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1)); } catch (Exception e) { try { // (just check it's not a string that is a valid int) hj.setNumReduceTasks( Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1"))); } catch (Exception e2) { } } } //TESTED } else { hj.setNumReduceTasks(0); } if ((null != job.combiner) && !job.combiner.startsWith("#") && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) { hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child)); } hj.setOutputKeyClass(Class.forName(job.outputKey, true, child)); hj.setOutputValueClass(Class.forName(job.outputValue, true, child)); hj.setJobName(job.jobtitle); currJobName = job.jobtitle; } catch (Error e) { // (messing about with class loaders = lots of chances for errors!) 
throw new RuntimeException(e.getMessage(), e); } if (bTestMode || bLocalMode) { hj.submit(); currThreadId = null; Logger.getRootLogger().addAppender(this); currLocalJobId = hj.getJobID().toString(); currLocalJobErrs.setLength(0); while (!hj.isComplete()) { Thread.sleep(1000); } Logger.getRootLogger().removeAppender(this); if (hj.isSuccessful()) { if (this.currLocalJobErrs.length() > 0) { return "local_done: " + this.currLocalJobErrs.toString(); } else { return "local_done"; } } else { return "Error: " + this.currLocalJobErrs.toString(); } } else { hj.submit(); String jobId = hj.getJobID().toString(); return jobId; } } catch (Exception e) { e.printStackTrace(); Thread.currentThread().setContextClassLoader(savedClassLoader); return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } }
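The launcher above treats the job's query string as an "advanced configuration" object and uses containsField to detect optional flags such as qt. A defensive-parsing sketch of that idea, with the flag name kept and everything else simplified and hypothetical:

import com.mongodb.BasicDBObject;

public class AdvancedConfig {
    static boolean isElasticsearchMode(String queryJson, boolean isCustomTable) {
        BasicDBObject advancedConfig;
        try {
            advancedConfig = (queryJson != null)
                    ? (BasicDBObject) com.mongodb.util.JSON.parse(queryJson)
                    : new BasicDBObject();
        } catch (Exception e) {
            advancedConfig = new BasicDBObject(); // unparseable query: fall back to defaults
        }
        return advancedConfig.containsField("qt") && !isCustomTable;
    }
}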
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable, String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer, String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue, String arguments, Boolean incrementalMode, ObjectId userId, Boolean selfMerge, String originalOutputCollection, Boolean appendResults) throws IOException { String dbserver = prop_general.getDatabaseServer(); output = outputDatabase + "." + tempOutputCollection; boolean isAdmin = AuthUtils.isAdmin(userId); int nSplits = 8; int nDocsPerSplit = 12500; //add communities to query if this is not a custom table BasicDBObject oldQueryObj = null; BasicDBObject srcTags = null;//from w w w. j a v a 2 s . c om // Start with the old query: if (query.startsWith("{")) { oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query); } else { oldQueryObj = new BasicDBObject(); } boolean elasticsearchQuery = oldQueryObj.containsField("qt") && !isCustomTable; int nLimit = 0; if (oldQueryObj.containsField("$limit")) { nLimit = oldQueryObj.getInt("$limit"); oldQueryObj.remove("$limit"); } if (oldQueryObj.containsField("$splits")) { nSplits = oldQueryObj.getInt("$splits"); oldQueryObj.remove("$splits"); } if (oldQueryObj.containsField("$srctags")) { srcTags = new BasicDBObject(SourcePojo.tags_, oldQueryObj.get("$srctags")); oldQueryObj.remove("$srctags"); } if (bLocalMode) { // If in local mode, then set this to a large number so we always run inside our limit/split version // (since for some reason MongoInputFormat seems to fail on large collections) nSplits = InfiniteMongoSplitter.MAX_SPLITS; } if (oldQueryObj.containsField("$docsPerSplit")) { nDocsPerSplit = oldQueryObj.getInt("$docsPerSplit"); oldQueryObj.remove("$docsPerSplit"); } oldQueryObj.remove("$fields"); oldQueryObj.remove("$output"); oldQueryObj.remove("$reducers"); String mapperKeyClass = oldQueryObj.getString("$mapper_key_class", ""); String mapperValueClass = oldQueryObj.getString("$mapper_value_class", ""); oldQueryObj.remove("$mapper_key_class"); oldQueryObj.remove("$mapper_value_class"); String cacheList = null; Object cacheObj = oldQueryObj.get("$caches"); if (null != cacheObj) { cacheList = cacheObj.toString(); // (either array of strings, or single string) if (!cacheList.startsWith("[")) { cacheList = "[" + cacheList + "]"; // ("must" now be valid array) } oldQueryObj.remove("$caches"); } //TESTED if (null != nDebugLimit) { // (debug mode override) nLimit = nDebugLimit; } boolean tmpIncMode = (null != incrementalMode) && incrementalMode; Date fromOverride = null; Date toOverride = null; Object fromOverrideObj = oldQueryObj.remove("$tmin"); Object toOverrideObj = oldQueryObj.remove("$tmax"); if (null != fromOverrideObj) { fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true); } if (null != toOverrideObj) { toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false); } if (!isCustomTable) { if (elasticsearchQuery) { oldQueryObj.put("communityIds", communityIds); //tmin/tmax not supported - already have that capability as part of the query } else { if (input.equals("feature.temporal")) { if ((null != fromOverride) || (null != toOverride)) { oldQueryObj.put("value.maxTime", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, true)); } //TESTED oldQueryObj.put("_id.c", new BasicDBObject(DbManager.in_, communityIds)); } else { oldQueryObj.put(DocumentPojo.communityId_, new 
BasicDBObject(DbManager.in_, communityIds)); if ((null != fromOverride) || (null != toOverride)) { oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false)); } //TESTED if (input.equals("doc_metadata.metadata")) { oldQueryObj.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (ensures not soft-deleted) } } } } else { if ((null != fromOverride) || (null != toOverride)) { oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false)); } //TESTED //get the custom table (and database) input = CustomOutputManager.getCustomDbAndCollection(input); } query = oldQueryObj.toString(); if (arguments == null) arguments = ""; // Generic configuration out.write("<?xml version=\"1.0\"?>\n<configuration>"); // Mongo specific configuration out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title + "</value></property>" + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>" + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>" + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://" + dbserver + "/" + input + "</value></property>" + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://" + dbserver + "/" + output + "</value> </property>" + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>" + StringEscapeUtils.escapeXml(query) + "</value></property>" + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>" + ((fields == null) ? ("") : fields) + "</value></property>" + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>" + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>" + nLimit + "</value><!-- 0 == no limit --></property>" + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? 
--><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>" + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper + "</value></property>" + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer + "</value></property>" + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>" + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat</value></property>" + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>" + outputKey + "</value></property>" + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>" + outputValue + "</value></property>" + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value>" + mapperKeyClass + "</value></property>" + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value>" + mapperValueClass + "</value></property>" + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>" + combiner + "</value></property>" + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>" + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>" + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>"); // Infinit.e specific configuration out.write("\n\t<property><!-- User Arguments [optional] --><name>infinit.e.userid</name><value>" + StringEscapeUtils.escapeXml(userId.toString()) + "</value></property>" + "\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>" + StringEscapeUtils.escapeXml(arguments) + "</value></property>" + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>" + nSplits + "</value></property>" + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>" + nDocsPerSplit + "</value></property>" + "\n\t<property><!-- Infinit.e incremental mode [optional] --><name>update.incremental</name><value>" + tmpIncMode + "</value></property>" + "\n\t<property><!-- Infinit.e quick admin check [optional] --><name>infinit.e.is.admin</name><value>" + isAdmin + "</value></property>" + "\n\t<property><!-- Infinit.e userid [optional] --><name>infinit.e.userid</name><value>" + userId + "</value></property>"); if (null != cacheList) { out.write( "\n\t<property><!-- Infinit.e cache list [optional] --><name>infinit.e.cache.list</name><value>" + cacheList + "</value></property>"); } //TESTED if (null != srcTags) { out.write( "\n\t<property><!-- Infinit.e src tags filter [optional] --><name>infinit.e.source.tags.filter</name><value>" + srcTags.toString() + "</value></property>"); } if (null != selfMerge && selfMerge && originalOutputCollection != null) { originalOutputCollection = "mongodb://" + dbserver + "/" + outputDatabase + "." 
+ originalOutputCollection; out.write( "\n\t<property><!-- This jobs output collection for passing into the mapper along with input collection [optional] --><name>infinit.e.selfMerge</name><value>" + originalOutputCollection + "</value></property>"); } // Closing thoughts: out.write("\n</configuration>"); out.flush(); out.close(); }
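The handling of $limit, $splits, $docsPerSplit and similar directives above boils down to one recurring idiom: check with containsField, read the value, then remove the key so it does not leak into the real MongoDB query. A standalone sketch of that idiom (defaults and field values are illustrative):

import com.mongodb.BasicDBObject;

public class DirectiveStripper {
    // Pull an integer directive out of the query, or return the default if absent.
    static int takeIntDirective(BasicDBObject query, String key, int defaultValue) {
        if (query.containsField(key)) {
            int value = query.getInt(key);
            query.remove(key); // strip it so only genuine query clauses remain
            return value;
        }
        return defaultValue;
    }

    public static void main(String[] args) {
        BasicDBObject query = new BasicDBObject("$limit", 100).append("sourceKey", "src.a");
        int nLimit = takeIntDirective(query, "$limit", 0);
        int nSplits = takeIntDirective(query, "$splits", 8);
        System.out.println(nLimit + " / " + nSplits + " / " + query); // 100, 8, and a query left with only sourceKey
    }
}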
From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) { ElasticSearchManager elasticManager = null; // Initialize the DB: DBCollection eventFeatureDB = DbManager.getFeature().getAssociation(); // Initialize the ES (create the index if it doesn't already): // 1. Set-up the entity feature index ElasticSearchManager.setDefaultClusterName("infinite-aws"); // (delete the index) //elasticManager = ElasticSearchManager.getIndex("association_index"); //elasticManager.deleteMe(); // Create the index if necessary String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class); Builder localSettings = ImmutableSettings.settingsBuilder(); localSettings.put("number_of_shards", 1).put("number_of_replicas", 0); localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping, localSettings);/*from w w w.ja v a 2s. c om*/ // Get the index (necessary if already created) if (null == elasticManager) { elasticManager = ElasticSearchManager.getIndex("association_index"); } // Now query the DB: DBCursor dbc = null; dbc = eventFeatureDB.find(query); if (null != chunk) { if (chunk.containsField(DbManager.min_)) { dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_)); } if (chunk.containsField(DbManager.max_)) { dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_)); } } dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000); if (null == chunk) { int nCount = dbc.count() - nSkip; if (nCount < 0) nCount = 0; System.out.println( "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit)); if (0 == nCount) { // Nothing to do... return; } } List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>(); int nSynced = 0; // Loop over array and invoke the cleansing function for each one while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class); // If this table has just been rebuilt from the document then the indexes are all wrong ... 
// recalculate and save if ('#' == evt.getIndex().charAt(0)) { AssociationPojo singleEvt = new AssociationPojo(); singleEvt.setEntity1_index(evt.getEntity1_index()); singleEvt.setEntity2_index(evt.getEntity2_index()); singleEvt.setVerb_category(evt.getVerb_category()); singleEvt.setGeo_index(evt.getGeo_index()); evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt)); eventFeatureDB .update(new BasicDBObject("_id", dbo.get("_id")), new BasicDBObject(MongoDbManager.set_, new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())), false, true); // (has to be a multi-update even though it's unique because it's sharded on index) } // Handle groups (system group is: "4c927585d591d31d7b37097a") if (null == evt.getCommunityId()) { evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a")); } // Bulk add prep events.add(evt); nSynced++; if (events.size() > 1000) { elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null, true); events.clear(); } } // End loop over entities //write whatevers left elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null, true); if (null != chunk) { System.out.println("Found " + nSynced + " records to sync in chunk"); } }
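The chunk handling above (and in the two transfer utilities that follow) uses containsField to apply $min/$max bounds only when the chunk descriptor actually carries them. A reduced sketch, assuming the DbManager.min_/max_ constants in the examples resolve to the driver's "$min"/"$max" specials:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;

public class ChunkScan {
    // Assumes the chunk descriptor carries its bounds under "$min"/"$max".
    static DBCursor boundedCursor(DBCollection coll, BasicDBObject query, BasicDBObject chunk,
                                  int nSkip, int nLimit) {
        DBCursor dbc = coll.find(query);
        if (chunk != null) {
            if (chunk.containsField("$min")) {
                dbc = dbc.addSpecial("$min", chunk.get("$min"));
            }
            if (chunk.containsField("$max")) {
                dbc = dbc.addSpecial("$max", chunk.get("$max"));
            }
        }
        return dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    }
}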
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk) throws IOException { PropertiesManager pm = new PropertiesManager(); int nMaxContentSize_bytes = pm.getMaxContentSize(); // Initialize the DB: DBCollection docsDB = DbManager.getDocument().getMetadata(); DBCollection contentDB = DbManager.getDocument().getContent(); DBCollection sourcesDB = DbManager.getIngest().getSource(); ElasticSearchManager.setDefaultClusterName("infinite-aws"); // 1. Get the documents from the DB (combining data + metadata and refreshing source meta) // (Ignore soft-deleted records:) if (null == query) { query = new BasicDBObject(); }//from ww w . j a va 2 s .c om Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_); if (null != sourceKeyQueryTerm) { if (query.toString() .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) { throw new RuntimeException( "Can't specify sourceKey as part of complex query term: " + query.toString()); } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }") if (sourceKeyQueryTerm instanceof String) { query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm)); } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }") else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1); fields.put(SourcePojo.highestDistributionFactorStored_, 1); DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields); LinkedList<String> sourceKeys = new LinkedList<String>(); for (DBObject dbo : dbc) { String key = (String) dbo.get(SourcePojo.key_); Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_); Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor); sourceKeys.addAll(sourceKeysForSource); } query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys)); } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }") else { throw new RuntimeException("Can't specify sourceKey as part of complex query term"); } //(actually not possible, just included here for mathematical completeness...) } else { if (query.toString() .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) { throw new RuntimeException("Can't specify sourceKey as part of complex query term"); } //TESTE (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }") // Optimize communityId into sourceKeys... 
if (null != query.get(DocumentPojo.communityId_)) { try { ObjectId commId = query.getObjectId(DocumentPojo.communityId_); BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1); fields.put(SourcePojo.highestDistributionFactorStored_, 1); DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields); LinkedList<String> sourceKeys = new LinkedList<String>(); int added = 0; for (DBObject dbo : dbc) { String key = (String) dbo.get(SourcePojo.key_); Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_); Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor); sourceKeys.addAll(sourceKeysForSource); added += sourceKeysForSource.size(); } query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys)); System.out.println("(Optimized simple community query to " + added + " source key(s))"); } catch (Exception e) { //DEBUG //e.printStackTrace(); System.out.println("(Can't optimize complex community query: " + e.getMessage()); } } //TESTED (by hand - including distributed source version) } // Ignored delete objects Object urlQuery = query.get(DocumentPojo.url_); if (null == urlQuery) { query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?) } //TESTED else if (urlQuery instanceof BasicDBObject) { ((BasicDBObject) urlQuery).append("$regex", "^[^?]"); } //TESTED //DEBUG //System.out.println("COMBINED QUERY= " + query.toString()); // If aggregating, kick off the background aggregation thread if (bAggregate) { EntityBackgroundAggregationManager.startThread(); AssociationBackgroundAggregationManager.startThread(); } //Debug: DBCursor dbc = null; dbc = docsDB.find(query); if (null != chunk) { if (chunk.containsField(DbManager.min_)) { dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_)); } if (chunk.containsField(DbManager.max_)) { dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_)); } } dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000); if (null == chunk) { int nCount = dbc.count() - nSkip; if (nCount < 0) nCount = 0; System.out.println( "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit)); if (0 == nCount) { // Nothing to do... return; } } byte[] storageArray = new byte[200000]; int nSynced = 0; LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>(); Map<ObjectId, LinkedList<DocumentPojo>> communityList = null; ObjectId currCommunityId = null; while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class); String sDocIndex = doc.getIndex(); if (null == sDocIndex) { sDocIndex = "document_index"; } if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) { _deletedIndex.add(sDocIndex); rebuildIndex(sDocIndex); try { // (Just in case the index requires some time to sort itself out) Thread.sleep(1000); } catch (InterruptedException e) { } } //Debug: //System.out.println("Getting content..." 
+ feed.getTitle() + " / " + feed.getUrl()); // Get the content: if ((0 != nMaxContentSize_bytes) && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) { BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey()))); BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1); fields.put(CompressedFullTextPojo.sourceKey_, 1); DBCursor dbcGzip = contentDB.find(contentQ, fields); while (dbcGzip.hasNext()) { BasicDBObject dboContent = (BasicDBObject) dbcGzip.next(); if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) { // If this has another version then ignore this one... if (dbc.hasNext()) { continue; } //TESTED (by hand) } byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_)); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = 0; StringBuffer output = new StringBuffer(); while (nRead >= 0) { nRead = gzip.read(storageArray, 0, 200000); if (nRead > 0) { String s = new String(storageArray, 0, nRead, "UTF-8"); output.append(s); } } doc.setFullText(output.toString()); } } // (else document has full text already) // Get tags, if necessary: // Always overwrite tags - one of the reasons we might choose to migrate // Also may need source in order to support source index filtering SourcePojo src = _sourceCache.get(doc.getSourceKey()); if (null == src) { //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?) BasicDBObject srcDbo = (BasicDBObject) sourcesDB .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey())); if (null != srcDbo) { src = SourcePojo.fromDb(srcDbo, SourcePojo.class); if (null != src.getProcessingPipeline()) { try { // Set the index settings HarvestController hc = new HarvestController(); HarvestControllerPipeline hcPipe = new HarvestControllerPipeline(); hcPipe.extractSource_preProcessingPipeline(src, hc); } catch (Exception e) { //DEBUG e.printStackTrace(); } } //TESTED (by hand) _sourceCache.put(doc.getSourceKey(), src); } } doc.setTempSource(src); // (needed for source index filtering) if (null != src) { if (null != src.getTags()) { Set<String> tagsTidied = new TreeSet<String>(); for (String s : src.getTags()) { String ss = s.trim().toLowerCase(); tagsTidied.add(ss); } // May also want to write this back to the DB: //TODO (INF-2223): Handle append tags or not in the pipeline... if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) { if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) { BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getRawSourceKey()); // (ie including the # if there is one) updateQuery.put(DocumentPojo._id_, doc.getId()); docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied)))); } doc.setTags(tagsTidied); // (just copy ptr across) } } } // 2. 
Update the index with the new document // (Optionally also update entity and assoc features) if (bAggregate) { if (null == currCommunityId) { currCommunityId = doc.getCommunityId(); } else if (!currCommunityId.equals(doc.getCommunityId())) { LinkedList<DocumentPojo> perCommunityDocList = null; if (null == communityList) { // (very first time we see > 1 community) communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>(); perCommunityDocList = new LinkedList<DocumentPojo>(); perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet) communityList.put(currCommunityId, perCommunityDocList); } currCommunityId = doc.getCommunityId(); perCommunityDocList = communityList.get(currCommunityId); if (null == perCommunityDocList) { perCommunityDocList = new LinkedList<DocumentPojo>(); communityList.put(currCommunityId, perCommunityDocList); } perCommunityDocList.add(doc); } } //TESTED nSynced++; docsToTransfer.add(doc); if (0 == (nSynced % 10000)) { StoreAndIndexManager manager = new StoreAndIndexManager(); if (bAggregate) { // Loop over communities and aggregate each one then store the modified entities/assocs doAggregation(communityList, docsToTransfer); communityList = null; // (in case the next 10,000 docs are all in the same community!) currCommunityId = null; } //TOTEST manager.addToSearch(docsToTransfer); docsToTransfer.clear(); System.out.println("(Synced " + nSynced + " records)"); } } // (End loop over docs) // Sync remaining docs if (!docsToTransfer.isEmpty()) { if (bAggregate) { // Loop over communities and aggregate each one then store the modified entities/assocs doAggregation(communityList, docsToTransfer); } StoreAndIndexManager manager = new StoreAndIndexManager(); manager.addToSearch(docsToTransfer); } if (null != chunk) { System.out.println("Found " + nSynced + " records to sync in chunk"); } if (bAggregate) { System.out.println("Completed. You can hit CTRL+C at any time."); System.out.println( "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities."); try { Thread.sleep(300000); } catch (InterruptedException e) { } // Turn off so we can exit EntityBackgroundAggregationManager.stopThreadAndWait(); AssociationBackgroundAggregationManager.stopThreadAndWait(); } }
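One narrow containsField use in the transfer above: a compressed-content record that lacks the sourceKey field is treated as a legacy duplicate and skipped when another record follows. A simplified sketch of that check; the literal "sourceKey" string and the helper below are assumptions, and the surrounding loop is condensed:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;

public class ContentSelector {
    // Hypothetical helper: skip content records that predate the sourceKey field
    // whenever a later (keyed) record is still available on the cursor.
    static void consumeContent(DBCursor dbcGzip) {
        while (dbcGzip.hasNext()) {
            BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
            if (!dboContent.containsField("sourceKey") && dbcGzip.hasNext()) {
                continue; // legacy record: prefer the newer version that follows
            }
            // ... decompress dboContent's gzipped content and use it ...
        }
    }
}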
From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) { ElasticSearchManager elasticManager = null; // Initialize the DB: DBCollection entityFeatureDB = DbManager.getFeature().getEntity(); // Initialize the ES (create the index if it doesn't already): // 1. Set-up the entity feature index String indexName = "entity_index"; ElasticSearchManager.setDefaultClusterName("infinite-aws"); // (delete the index) //elasticManager = ElasticSearchManager.getIndex(indexName); //elasticManager.deleteMe(); // Create the index if necessary String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class); Builder localSettings = ImmutableSettings.settingsBuilder(); localSettings.put("number_of_shards", 1).put("number_of_replicas", 0); localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings); // Get the index (necessary if already created) if (null == elasticManager) { elasticManager = ElasticSearchManager.getIndex(indexName); }//w ww . j a v a 2 s. co m // Now query the DB: DBCursor dbc = null; dbc = entityFeatureDB.find(query); if (null != chunk) { if (chunk.containsField(DbManager.min_)) { dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_)); } if (chunk.containsField(DbManager.max_)) { dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_)); } } dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000); if (null == chunk) { int nCount = dbc.count() - nSkip; if (nCount < 0) nCount = 0; System.out.println( "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit)); if (0 == nCount) { // Nothing to do... return; } } int nSynced = 0; List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>(); while (dbc.hasNext()) { EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class); if (null != feature.getAlias()) { // (some corrupt gazateer entry) // Handle groups (system group is: "4c927585d591d31d7b37097a") // if there is no community id, add system group (something is wrong if this happens?) if (null == feature.getCommunityId()) { feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a")); } } entities.add(feature); nSynced++; // Add the entities if (entities.size() > 1000) { elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()), "_id", null, true); // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community) entities = new ArrayList<EntityFeaturePojo>(); } } //write whatevers left elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()), "_id", null, true); // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community) if (null != chunk) { System.out.println("Found " + nSynced + " records to sync in chunk"); } }
From source file:com.images3.data.impl.MongoDBObjectMapper.java
License:Apache License
public ImageMetricsOS mapToImageMetricsOS(BasicDBObject source) {
    Map<ImageMetricsType, Long> numbers = new HashMap<ImageMetricsType, Long>();
    numbers.put(ImageMetricsType.COUNTS_INBOUND,
            (source.containsField(ImageMetricsType.COUNTS_INBOUND.toString())
                    ? source.getLong(ImageMetricsType.COUNTS_INBOUND.toString())
                    : 0L));
    numbers.put(ImageMetricsType.COUNTS_OUTBOUND,
            (source.containsField(ImageMetricsType.COUNTS_OUTBOUND.toString())
                    ? source.getLong(ImageMetricsType.COUNTS_OUTBOUND.toString())
                    : 0L));
    numbers.put(ImageMetricsType.SIZE_INBOUND,
            (source.containsField(ImageMetricsType.SIZE_INBOUND.toString())
                    ? source.getLong(ImageMetricsType.SIZE_INBOUND.toString())
                    : 0L));
    numbers.put(ImageMetricsType.SIZE_OUTBOUND,
            (source.containsField(ImageMetricsType.SIZE_OUTBOUND.toString())
                    ? source.getLong(ImageMetricsType.SIZE_OUTBOUND.toString())
                    : 0L));
    return new ImageMetricsOS(source.getString("imagePlantId"), source.getString("templateName"),
            source.getLong("second"), numbers);
}
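This mapper defaults each metric to 0L when the field is absent, because BasicDBObject.getLong(String) dereferences the stored value and throws a NullPointerException for a missing key. The repeated ternary can be factored into a small helper; a sketch with hypothetical field names:

import com.mongodb.BasicDBObject;

public class MetricsDefaultSketch {

    // BasicDBObject.getLong(key) NPEs on a missing key, so guard with containsField first.
    static long getLongOrDefault(BasicDBObject source, String field, long fallback) {
        return source.containsField(field) ? source.getLong(field) : fallback;
    }

    public static void main(String[] args) {
        // "countsInbound" / "sizeOutbound" are hypothetical field names for illustration.
        BasicDBObject source = new BasicDBObject("countsInbound", 42L);
        System.out.println(getLongOrDefault(source, "countsInbound", 0L)); // 42
        System.out.println(getLongOrDefault(source, "sizeOutbound", 0L));  // 0
    }
}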
From source file:com.impetus.client.mongodb.query.MongoDBQuery.java
License:Apache License
/**
 * Creates MongoDB Query object from filterClauseQueue.
 * 
 * @param m
 *            the m
 * @param filterClauseQueue
 *            the filter clause queue
 * @return the basic db object
 */
public BasicDBObject createSubMongoQuery(EntityMetadata m, Queue filterClauseQueue) {
    BasicDBObject query = new BasicDBObject();
    BasicDBObject compositeColumns = new BasicDBObject();

    MetamodelImpl metaModel = (MetamodelImpl) kunderaMetadata.getApplicationMetadata()
            .getMetamodel(m.getPersistenceUnit());

    AbstractManagedType managedType = (AbstractManagedType) metaModel.entity(m.getEntityClazz());

    for (Object object : filterClauseQueue) {
        boolean isCompositeColumn = false;

        boolean isSubCondition = false;

        if (object instanceof FilterClause) {
            FilterClause filter = (FilterClause) object;
            String property = filter.getProperty();
            String condition = filter.getCondition();
            Object value = filter.getValue().get(0);

            // value is string but field.getType is different, then get
            // value using
            Field f = null;

            // if alias is still present .. means it is an enclosing
            // document search.
            if (managedType.hasLobAttribute()) {
                EntityType entity = metaModel.entity(m.getEntityClazz());
                String fieldName = m.getFieldName(property);

                f = (Field) entity.getAttribute(fieldName).getJavaMember();

                if (value.getClass().isAssignableFrom(String.class) && f != null
                        && !f.getType().equals(value.getClass())) {
                    value = PropertyAccessorFactory.getPropertyAccessor(f).fromString(f.getType().getClass(),
                            value.toString());
                }
                value = MongoDBUtils.populateValue(value, value.getClass());

                property = "metadata." + property;
            } else {
                if (((AbstractAttribute) m.getIdAttribute()).getJPAColumnName().equalsIgnoreCase(property)) {
                    property = "_id";
                    f = (Field) m.getIdAttribute().getJavaMember();
                    if (metaModel.isEmbeddable(m.getIdAttribute().getBindableJavaType())
                            && value.getClass().isAssignableFrom(f.getType())) {
                        EmbeddableType compoundKey = metaModel
                                .embeddable(m.getIdAttribute().getBindableJavaType());
                        compositeColumns = MongoDBUtils.getCompoundKeyColumns(m, value, compoundKey);
                        isCompositeColumn = true;
                        continue;
                    }
                } else if (metaModel.isEmbeddable(m.getIdAttribute().getBindableJavaType())
                        && StringUtils.contains(property, '.')) {
                    // Means it is a case of composite column.
                    property = property.substring(property.indexOf(".") + 1);
                    isCompositeColumn = true;
                }
                /*
                 * if a composite key. "." assuming "." is part of
                 * property in case of embeddable only
                 */
                else if (StringUtils.contains(property, '.')) {
                    EntityType entity = metaModel.entity(m.getEntityClazz());
                    StringTokenizer tokenizer = new StringTokenizer(property, ".");

                    String embeddedAttributeAsStr = tokenizer.nextToken();

                    String embeddableAttributeAsStr = tokenizer.nextToken();

                    Attribute embeddedAttribute = entity.getAttribute(embeddedAttributeAsStr);

                    EmbeddableType embeddableEntity = metaModel
                            .embeddable(((AbstractAttribute) embeddedAttribute).getBindableJavaType());

                    f = (Field) embeddableEntity.getAttribute(embeddableAttributeAsStr).getJavaMember();

                    property = ((AbstractAttribute) embeddedAttribute).getJPAColumnName() + "."
                            + ((AbstractAttribute) embeddableEntity.getAttribute(embeddableAttributeAsStr))
                                    .getJPAColumnName();
                } else {
                    EntityType entity = metaModel.entity(m.getEntityClazz());
                    String discriminatorColumn = ((AbstractManagedType) entity).getDiscriminatorColumn();

                    if (!property.equals(discriminatorColumn)) {
                        String fieldName = m.getFieldName(property);
                        f = (Field) entity.getAttribute(fieldName).getJavaMember();
                    }
                }

                if (value.getClass().isAssignableFrom(String.class) && f != null
                        && !f.getType().equals(value.getClass())) {
                    value = PropertyAccessorFactory.getPropertyAccessor(f).fromString(f.getType().getClass(),
                            value.toString());
                }
                value = MongoDBUtils.populateValue(value, value.getClass());
            }

            // Property, if doesn't exist in entity, may be there in a
            // document embedded within it, so we have to check that
            // TODO: Query should actually be in a format
            // documentName.embeddedDocumentName.column, remove below if
            // block once this is decided

            // Query could be geospatial in nature
            if (f != null && f.getType().equals(Point.class)) {
                GeospatialQuery geospatialQueryimpl = GeospatialQueryFactory
                        .getGeospatialQueryImplementor(condition, value);
                query = (BasicDBObject) geospatialQueryimpl.createGeospatialQuery(property, value, query);
            } else {
                if (isCompositeColumn) {
                    EmbeddableType embeddableType = metaModel
                            .embeddable(m.getIdAttribute().getBindableJavaType());
                    AbstractAttribute attribute = (AbstractAttribute) embeddableType.getAttribute(property);
                    property = new StringBuffer("_id.").append(attribute.getJPAColumnName()).toString();
                }
                if (condition.equals("=")) {
                    query.append(property, value);
                } else if (condition.equalsIgnoreCase("like")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property, ((BasicDBObject) query.get(property)).append("$regex",
                                createLikeRegex((String) value)));
                    } else {
                        query.append(property, new BasicDBObject("$regex", createLikeRegex((String) value)));
                    }
                } else if (condition.equalsIgnoreCase(">")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property, ((BasicDBObject) query.get(property)).append("$gt", value));
                    } else {
                        query.append(property, new BasicDBObject("$gt", value));
                    }
                } else if (condition.equalsIgnoreCase(">=")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property, ((BasicDBObject) query.get(property)).append("$gte", value));
                    } else {
                        query.append(property, new BasicDBObject("$gte", value));
                    }
                } else if (condition.equalsIgnoreCase("<")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property, ((BasicDBObject) query.get(property)).append("$lt", value));
                    } else {
                        query.append(property, new BasicDBObject("$lt", value));
                    }
                } else if (condition.equalsIgnoreCase("<=")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property, ((BasicDBObject) query.get(property)).append("$lte", value));
                    } else {
                        query.append(property, new BasicDBObject("$lte", value));
                    }
                } else if (condition.equalsIgnoreCase("in")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property,
                                ((BasicDBObject) query.get(property)).append("$in", filter.getValue()));
                    } else {
                        query.append(property, new BasicDBObject("$in", filter.getValue()));
                    }
                } else if (condition.equalsIgnoreCase("not in")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property,
                                ((BasicDBObject) query.get(property)).append("$nin", filter.getValue()));
                    } else {
                        query.append(property, new BasicDBObject("$nin", filter.getValue()));
                    }
                } else if (condition.equalsIgnoreCase("<>")) {
                    if (query.containsField(property)) {
                        query.get(property);
                        query.put(property, ((BasicDBObject) query.get(property)).append("$ne", value));
                    } else {
                        query.append(property, new BasicDBObject("$ne", value));
                    }
                }
            }

            // TODO: Add support for other operators like >, <, >=, <=,
            // order by asc/ desc, limit, skip, count etc
        }
    }
    if (!compositeColumns.isEmpty()) {
        query.append("_id", compositeColumns);
    }

    return query;
}
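Every comparison branch above uses the same idiom: if the property already maps to an operator document, append the new operator to it, otherwise start a fresh { "$op": value } sub-document. A condensed sketch of that merge helper (hypothetical names, not Kundera API):

import com.mongodb.BasicDBObject;

public class OperatorMergeSketch {

    // If the property already holds an operator document, append the new operator to it;
    // otherwise create a new {"<op>": value} sub-document (same containsField check as above).
    static void addOperator(BasicDBObject query, String property, String op, Object value) {
        if (query.containsField(property)) {
            ((BasicDBObject) query.get(property)).append(op, value);
        } else {
            query.append(property, new BasicDBObject(op, value));
        }
    }

    public static void main(String[] args) {
        BasicDBObject query = new BasicDBObject();
        addOperator(query, "age", "$gte", 21);
        addOperator(query, "age", "$lt", 65); // merged into the same sub-document
        System.out.println(query); // roughly: { "age" : { "$gte" : 21 , "$lt" : 65 } }
    }
}

Note the same constraint as the original code: if an earlier "=" clause stored a plain value under the property, the cast to BasicDBObject would fail, so equality and range conditions on one property cannot be mixed this way.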