List of usage examples for com.mongodb.BasicDBObject.getInt
public int getInt(final String key, final int def)
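This two-argument overload returns the named field coerced to an int, falling back to def when the field is absent (unlike the single-argument getInt(key), which has no fallback). Before the project examples below, here is a minimal, hypothetical sketch — the field names and values are illustrative only, and the legacy mongo-java-driver (the same API the examples use) is assumed to be on the classpath:

import com.mongodb.BasicDBObject;

public class GetIntExample {
    public static void main(String[] args) {
        // Hypothetical document; these field names do not come from the examples below.
        BasicDBObject doc = new BasicDBObject("retries", 3);

        int retries = doc.getInt("retries", 0);   // field present: returns 3
        int timeout = doc.getInt("timeout", 30);  // field missing: returns the default, 30

        System.out.println(retries + " / " + timeout);
    }
}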
From source file:com.avanza.ymer.MirroredObject.java
License:Apache License
int getDocumentVersion(BasicDBObject dbObject) {
    return dbObject.getInt(DOCUMENT_FORMAT_VERSION_PROPERTY, 1);
}
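Note the default here: documents persisted before DOCUMENT_FORMAT_VERSION_PROPERTY was introduced carry no version field at all, so the fallback of 1 presumably treats them as the first document format version rather than failing.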
From source file:com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_Associations.java
License:Open Source License
static void addStandaloneEvents(BasicDBObject doc, double dDocSig, int nPhase, StandaloneEventHashAggregator standaloneEventAggregator, boolean bEntTypeFilterPositive, boolean bAssocVerbFilterPositive, HashSet<String> entTypeFilter, HashSet<String> assocVerbFilter, boolean bEvents, boolean bSummaries, boolean bFacts) { if (standaloneEventAggregator.bSimulateAggregation) { bSummaries = false;/*from ww w .j a v a 2 s .co m*/ } String sDocIsoPubDate = null; BasicDBList lev = (BasicDBList) (doc.get(DocumentPojo.associations_)); if (null != lev) { for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); String sEvType = e.getString(AssociationPojo.assoc_type_); boolean bIsFact = false; boolean bIsSummary = false; boolean bKeep = true; if (null == sEvType) { bKeep = false; } else if (sEvType.equalsIgnoreCase("event")) { if (!bEvents) bKeep = false; } else if (sEvType.equalsIgnoreCase("fact")) { if (!bFacts) bKeep = false; bIsFact = true; } else if (sEvType.equalsIgnoreCase("summary")) { if (!bSummaries) bKeep = false; bIsSummary = true; } //TESTED x4 // Filter and aliasing logic: if (bKeep) { boolean bKeep2 = filterAndAliasAssociation(e, standaloneEventAggregator.aliasLookup, true, bEntTypeFilterPositive, bAssocVerbFilterPositive, entTypeFilter, assocVerbFilter); if (!bKeep2) { e0.remove(); // (remove/rename events based on filters where we can, // means we don't have to do it in stage4) bKeep = false; } } //TESTED if (bKeep) { String time_start = null; String time_end = null; // (normally not needed) if (!standaloneEventAggregator.bSimulateAggregation) { //else times are discarded // Add time from document time_start = e.getString(AssociationPojo.time_start_); if (null == time_start) { if (null == sDocIsoPubDate) { // Convert docu pub date to ISO (day granularity): Date pubDate = (Date) doc.get(DocumentPojo.publishedDate_); if (null != pubDate) { SimpleDateFormat f2 = new SimpleDateFormat("yyyy-MM-dd"); time_start = f2.format(pubDate); } } else { time_start = sDocIsoPubDate; // (so it doesn't get added again below) } } //TESTED else { // Remove hourly granularity for consistency time_start = time_start.replaceAll("T.*$", ""); time_end = e.getString(AssociationPojo.time_end_); if (null != time_end) { time_end = time_end.replaceAll("T.*$", ""); } } //TESTED (with debug code, eg time_start = "1997-07-16T19:20:30+01:00") if (null != time_start) { // Ensure it has day granularity, to help with aggregation e.put(AssociationPojo.time_start_, time_start); if (null != time_end) { e.put(AssociationPojo.time_end_, time_end); } } //TESTED } //(end if normal standalone mode, not aggregation simulation) StandaloneEventHashCode evtHolder = new StandaloneEventHashCode( standaloneEventAggregator.bSimulateAggregation, e, bIsSummary, bIsFact); BasicDBObject oldEvt = standaloneEventAggregator.store.get(evtHolder); if (null == oldEvt) { // Doc count (see below) e.put(AssociationPojo.doccount_, 1); double dAssocSig = dDocSig * dDocSig; // Weight down summaries slightly (80%), and summaries with missing entities a lot (50%) if (bIsSummary) { String sEntity2 = (String) e.get(AssociationPojo.entity2_); if (null == sEntity2) { dAssocSig *= 0.50; } else { dAssocSig *= 0.80; } } // Running significance count: e.put(AssociationPojo.assoc_sig_, dAssocSig); // (use sum-squared to score up events that occur frequently) if (dAssocSig > standaloneEventAggregator.dMaxSig) { standaloneEventAggregator.dMaxSig = dAssocSig; } standaloneEventAggregator.store.put(evtHolder, e); // Add to list 
in some sort of very basic order... if (2 == nPhase) { // Put at the back, it's probably really low sig standaloneEventAggregator.tmpList.add(e); } else if (1 == nPhase) { // Put at the front until Phase 0 comes along standaloneEventAggregator.tmpList.addFirst(e); standaloneEventAggregator.nPhase1Events++; } else { // phases 0 and 1 get the higher orderings standaloneEventAggregator.tmpList.addFirst(e); standaloneEventAggregator.nPhase0Events++; } } else { // Update doc count long nDocCount = oldEvt.getInt(AssociationPojo.doccount_, 1) + 1; oldEvt.put(AssociationPojo.doccount_, nDocCount); // Running significance count: double dAssocSig = oldEvt.getDouble(AssociationPojo.doccount_) + dDocSig * dDocSig; oldEvt.put(AssociationPojo.assoc_sig_, dAssocSig); if (dAssocSig / nDocCount > standaloneEventAggregator.dMaxSig) { standaloneEventAggregator.dMaxSig = dAssocSig; } if (bIsFact && !standaloneEventAggregator.bSimulateAggregation) { // For facts, also update the time range: String old_time_start = oldEvt.getString(AssociationPojo.time_start_); String old_time_end = oldEvt.getString(AssociationPojo.time_end_); // Just keep this really simple and inefficient: TreeSet<String> timeOrder = new TreeSet<String>(); if (null != old_time_start) { timeOrder.add(old_time_start); } if (null != old_time_end) { timeOrder.add(old_time_end); } if (null != time_start) { timeOrder.add(time_start); } if (null != time_end) { timeOrder.add(time_end); } if (timeOrder.size() > 1) { Iterator<String> itStart = timeOrder.iterator(); oldEvt.put(AssociationPojo.time_start_, itStart.next()); Iterator<String> itEnd = timeOrder.descendingIterator(); oldEvt.put(AssociationPojo.time_end_, itEnd.next()); } } // end if is fact - treat times different } //TESTED } // (end if keeping this event) } // (end loop over events) } // (end if this doc has events) }
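The getInt call above drives a read-increment-write counter on an in-memory document: the previously stored association is looked up, its doccount_ is read with a default of 1 (covering an entry whose counter was never set), incremented, and written back with put. A stripped-down sketch of that pattern, using a plain field name in place of the AssociationPojo constant used above:

import com.mongodb.BasicDBObject;

public class DocCountSketch {
    // Assumed field name; the example above uses AssociationPojo.doccount_.
    private static final String DOC_COUNT = "doccount";

    static void recordOccurrence(BasicDBObject storedEvent) {
        // Default of 1 so an event stored without a counter still counts itself once.
        int docCount = storedEvent.getInt(DOC_COUNT, 1) + 1;
        storedEvent.put(DOC_COUNT, docCount);
    }

    public static void main(String[] args) {
        BasicDBObject evt = new BasicDBObject();
        recordOccurrence(evt);  // no counter yet -> 1 + 1 = 2
        recordOccurrence(evt);  // -> 3
        System.out.println(evt.getInt(DOC_COUNT, 1));
    }
}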
From source file:com.ikanow.infinit.e.core.utils.SourceUtils.java
License:Open Source License
private static boolean updateHarvestDistributionState_tokenComplete(SourcePojo source, HarvestEnum harvestStatus, BasicDBObject incClause, BasicDBObject setClause) { // Update tokens complete, and retrieve modified version int nTokensToBeCleared = source.getDistributionTokens().size(); BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId()); BasicDBObject modify = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject( SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, nTokensToBeCleared)); BasicDBObject fields = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 1);/*from www .j a va2 s .co m*/ fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1); fields.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, 1); BasicDBObject partial = (BasicDBObject) MongoDbManager.getIngest().getSource().findAndModify(query, fields, null, false, modify, true, false); //(return new version - ensures previous increments have been taken into account) // Two cases: source complete (all tokens obtained), source incomplete: if (null != partial) { // (else yikes!) BasicDBObject partialStatus = (BasicDBObject) partial.get(SourcePojo.harvest_); if (null != partialStatus) { // (else yikes!) int nTokensComplete = partialStatus.getInt(SourceHarvestStatusPojo.distributionTokensComplete_, 0); // (note after increment) // COMPLETE: reset parameters, status -> error (if anything has errored), success (all done), success_iteration (more to do) if (nTokensComplete == source.getDistributionFactor()) { if (!source.reachedMaxDocs()) { // (Can only do this if we've finished the source... //...else the different threads can be at different points, so the most recent doc for one thread might be // before the most recent doc of another) setClause.put(SourceHarvestStatusPojo.sourceQuery_distributedLastCompletedCycle_, new Date()); } setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0); setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_, source.getDistributionFactor()); setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false); // (resetting this) // This source is now complete String status = partialStatus.getString(SourceHarvestStatusPojo.harvest_status_, null); Boolean reachedLimit = partialStatus.getBoolean( SourceHarvestStatusPojo.distributionReachedLimit_, false) || source.reachedMaxDocs(); if ((null != status) && ((status.equalsIgnoreCase(HarvestEnum.error.toString()) || (HarvestEnum.error == harvestStatus)))) { setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.error.toString()); } //TESTED (current and previous state == error) else if (reachedLimit || (HarvestEnum.success_iteration == harvestStatus)) { setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.success_iteration.toString()); } //TESTED (from previous or current state) // (else leave with default of success) //DEBUG //System.out.println(Thread.currentThread().getName() + " COMPLETE_SRC COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete); return true; } //TESTED else { // Not complete // If we're here then we're only allowed to update the status to error if (HarvestEnum.error != harvestStatus) { setClause.remove(SourceHarvestStatusPojo.sourceQuery_harvest_status_); } //TESTED if (source.reachedMaxDocs()) { 
setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, true); } //TESTED //DEBUG //System.out.println(Thread.currentThread().getName() + " COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete); return false; } //(end is complete or not) //TESTED (reached max limit) } //(end found partial source status, else catastrophic failure) } //(end found partial source, else catastrophic failure) return false; }
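Here getInt reads distributionTokensComplete_ out of the harvest-status sub-document returned by findAndModify, defaulting to 0 for a source that has never recorded a completed token; when the count reaches the distribution factor the source is treated as complete. A reduced sketch of that completeness check, with hypothetical field names standing in for the SourceHarvestStatusPojo constants:

import com.mongodb.BasicDBObject;

public class TokenCompletionSketch {
    // "harvest" and "distributionTokensComplete" are stand-ins for the pojo constants above.
    static boolean allTokensComplete(BasicDBObject sourceAfterIncrement, int distributionFactor) {
        BasicDBObject harvest = (BasicDBObject) sourceAfterIncrement.get("harvest");
        if (harvest == null) {
            return false; // no status block yet
        }
        // Default to 0 when the counter field has never been written.
        int tokensComplete = harvest.getInt("distributionTokensComplete", 0);
        return tokensComplete == distributionFactor;
    }

    public static void main(String[] args) {
        BasicDBObject source = new BasicDBObject("harvest",
                new BasicDBObject("distributionTokensComplete", 4));
        System.out.println(allTokensComplete(source, 4)); // true
    }
}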
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation) throws IOException, SAXException, ParserConfigurationException { StringWriter xml = new StringWriter(); String outputCollection = job.outputCollectionTemp;// (non-append mode) if ((null != job.appendResults) && job.appendResults) outputCollection = job.outputCollection; // (append mode, write directly in....) else if (null != job.incrementalMode) job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode) createConfigXML(xml, job.jobtitle, job.inputCollection, InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper, job.reducer, job.combiner,/*from www. j a v a2 s. c om*/ InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode, job.submitterID, job.selfMerge, job.outputCollection, job.appendResults); ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, savedClassLoader); Thread.currentThread().setContextClassLoader(child); // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable boolean dataModelLoaded = true; try { URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, null); try { Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest); } catch (ClassNotFoundException e2) { //(this is fine, will use the cached version) dataModelLoaded = false; } if (dataModelLoaded) Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest); } catch (ClassNotFoundException e1) { throw new RuntimeException( "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards"); } // Now load the XML into a configuration object: Configuration config = new Configuration(); // Add the client configuration overrides: if (!bLocalMode) { String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/"; config.addResource(new Path(hadoopConfigPath + "core-site.xml")); config.addResource(new Path(hadoopConfigPath + "mapred-site.xml")); config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml")); } //TESTED try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes())); NodeList nList = doc.getElementsByTagName("property"); for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String name = getTagValue("name", eElement); String value = getTagValue("value", eElement); if ((null != name) && (null != value)) { config.set(name, value); } } } } catch (Exception e) { throw new IOException(e.getMessage()); } // Some other config defaults: // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config) config.set("mapred.map.tasks.speculative.execution", "false"); config.set("mapred.reduce.tasks.speculative.execution", "false"); // 
(default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera) // Now run the JAR file try { BasicDBObject advancedConfigurationDbo = null; try { advancedConfigurationDbo = (null != job.query) ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query)) : (new BasicDBObject()); } catch (Exception e) { advancedConfigurationDbo = new BasicDBObject(); } boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable; if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) { throw new RuntimeException( "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead."); } config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing) if (bLocalMode) { // local job tracker and FS mode config.set("mapred.job.tracker", "local"); config.set("fs.default.name", "local"); } else { if (bTestMode) { // run job tracker locally but FS mode remotely config.set("mapred.job.tracker", "local"); } else { // normal job tracker String trackerUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker"); config.set("mapred.job.tracker", trackerUrl); } String fsUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name"); config.set("fs.default.name", fsUrl); } if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.data_model.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.processing.custom.library.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } //TESTED // Debug scripts (only if they exist), and only in non local/test mode if (!bLocalMode && !bTestMode) { try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_map_error_handler.sh", config); config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle); config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_reduce_error_handler.sh", config); config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on } //TODO (???): TOTEST // (need to do these 2 things here before the job is created, at which point the config class has been copied across) //1) Class<?> mapperClazz = Class.forName(job.mapper, true, child); if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz .newInstance(); preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode)); } //TESTED //2) if 
(job.inputCollection.equalsIgnoreCase("file.binary_shares")) { // Need to download the GridFSZip file try { Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/", "GridFSZipFile.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } catch (Throwable t) { } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!) } if (job.inputCollection.equals("records")) { InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo); //(won't run under 0.19 so running with "records" should cause all sorts of exceptions) } //TESTED (by hand) if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec"); } // Manually specified caches List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"), job, config, props_custom); Job hj = new Job(config); // (NOTE: from here, changes to config are ignored) try { if (null != localJarCaches) { if (bLocalMode || bTestMode) { Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class }); method.setAccessible(true); method.invoke(child, localJarCaches.toArray()); } //TOTEST (tested logically) } Class<?> classToLoad = Class.forName(job.mapper, true, child); hj.setJarByClass(classToLoad); if (job.inputCollection.equalsIgnoreCase("filesystem")) { String inputPath = null; try { inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); if (!inputPath.endsWith("/")) { inputPath = inputPath + "/"; } } catch (Exception e) { } if (null == inputPath) { throw new RuntimeException("Must specify 'file.url' if reading from filesystem."); } inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath); InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive) InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB) InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child)); } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) { String[] oidStrs = null; try { String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)"); Matcher m = oidExtractor.matcher(inputPath); if (m.find()) { oidStrs = m.group(1).split("\\s*,\\s*"); } else { throw new RuntimeException( "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath); } InfiniteHadoopUtils.authenticateShareList(job, oidStrs); } catch (Exception e) { throw new RuntimeException( "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e); } hj.getConfiguration().setStrings("mapred.input.dir", oidStrs); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child)); } else if (job.inputCollection.equals("records")) { hj.setInputFormatClass((Class<? extends InputFormat>) Class .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child)); } else { if (esMode) { hj.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName( "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat", true, child)); } else { hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child)); } } if ((null != job.exportToHdfs) && job.exportToHdfs) { //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?) Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom); if ((null != job.outputKey) && (null != job.outputValue) && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text") && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) { // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text) hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child)); TextOutputFormat.setOutputPath(hj, outPath); } //TESTED else { hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child)); SequenceFileOutputFormat.setOutputPath(hj, outPath); } //TESTED } else { // normal case, stays in MongoDB hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child)); } hj.setMapperClass((Class<? extends Mapper>) mapperClazz); String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null); if (null != mapperOutputKeyOverride) { hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride)); } //TESTED String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null); if (null != mapperOutputValueOverride) { hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride)); } //TESTED if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null") && !job.reducer.equalsIgnoreCase("none")) { hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child)); // Variable reducers: if (null != job.query) { try { hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1)); } catch (Exception e) { try { // (just check it's not a string that is a valid int) hj.setNumReduceTasks( Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1"))); } catch (Exception e2) { } } } //TESTED } else { hj.setNumReduceTasks(0); } if ((null != job.combiner) && !job.combiner.startsWith("#") && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) { hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child)); } hj.setOutputKeyClass(Class.forName(job.outputKey, true, child)); hj.setOutputValueClass(Class.forName(job.outputValue, true, child)); hj.setJobName(job.jobtitle); currJobName = job.jobtitle; } catch (Error e) { // (messing about with class loaders = lots of chances for errors!) 
throw new RuntimeException(e.getMessage(), e); } if (bTestMode || bLocalMode) { hj.submit(); currThreadId = null; Logger.getRootLogger().addAppender(this); currLocalJobId = hj.getJobID().toString(); currLocalJobErrs.setLength(0); while (!hj.isComplete()) { Thread.sleep(1000); } Logger.getRootLogger().removeAppender(this); if (hj.isSuccessful()) { if (this.currLocalJobErrs.length() > 0) { return "local_done: " + this.currLocalJobErrs.toString(); } else { return "local_done"; } } else { return "Error: " + this.currLocalJobErrs.toString(); } } else { hj.submit(); String jobId = hj.getJobID().toString(); return jobId; } } catch (Exception e) { e.printStackTrace(); Thread.currentThread().setContextClassLoader(savedClassLoader); return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } }
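One detail worth noting in the block above: getInt("$reducers", 1) only tolerates a missing field, not a mis-typed one. The surrounding try/catch falls back to getString plus Integer.parseInt because a "$reducers" value supplied as a JSON string would make getInt throw rather than return the default. A compact sketch of that defensive read, assuming a user-supplied query object:

import com.mongodb.BasicDBObject;

public class ReducerCountSketch {
    /** Reads "$reducers" whether it was supplied as a number or as a string. */
    static int reducerCount(BasicDBObject advancedConfig) {
        try {
            return advancedConfig.getInt("$reducers", 1); // numeric value, or 1 if absent
        } catch (Exception e) {
            try {
                // Value was probably stored as a string, e.g. "8".
                return Integer.parseInt(advancedConfig.getString("$reducers", "1"));
            } catch (Exception e2) {
                return 1; // unparsable value: fall back to a single reducer
            }
        }
    }

    public static void main(String[] args) {
        System.out.println(reducerCount(new BasicDBObject("$reducers", 4)));   // 4
        System.out.println(reducerCount(new BasicDBObject("$reducers", "8"))); // 8
        System.out.println(reducerCount(new BasicDBObject()));                 // 1
    }
}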
From source file:com.ikanow.infinit.e.processing.custom.utils.CustomApiUtils.java
License:Apache License
public static void getJobResults(ResponsePojo rp, CustomMapReduceJobPojo cmr, int limit, String fields, String findStr, String sortStr, boolean bCsv) { BasicDBObject queryDbo = null;//from w w w. j av a2 s. c o m if (null != findStr) { queryDbo = (BasicDBObject) com.mongodb.util.JSON.parse(findStr); } else { queryDbo = new BasicDBObject(); } //TOTEST BasicDBObject fieldsDbo = new BasicDBObject(); if (null != fields) { fieldsDbo = (BasicDBObject) com.mongodb.util.JSON.parse("{" + fields + "}"); } //return the results: // Need to handle sorting... BasicDBObject sort = null; if (null != sortStr) { //override sort = (BasicDBObject) com.mongodb.util.JSON.parse(sortStr); } else { //defaults String sortField = "_id"; int sortDir = 1; BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse( InfiniteHadoopUtils.getQueryOrProcessing(cmr.query, InfiniteHadoopUtils.QuerySpec.POSTPROC)); if (postProcObject != null) { sortField = postProcObject.getString("sortField", "_id"); sortDir = postProcObject.getInt("sortDirection", 1); } //TESTED (post proc and no post proc) sort = new BasicDBObject(sortField, sortDir); } //TOTEST // Case 1: DB rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", true, "Map reduce job completed at: " + cmr.lastCompletionTime)); if ((null == cmr.exportToHdfs) || !cmr.exportToHdfs) { DBCursor resultCursor = null; DBCollection coll = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection); DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory(); CsvGeneratingBsonDecoder csvDecoder = null; SizeReportingBasicBSONDecoder sizeDecoder = null; CustomMapReduceResultPojo cmrr = new CustomMapReduceResultPojo(); try { if (bCsv) { coll.setDBDecoderFactory((csvDecoder = new CsvGeneratingBsonDecoder())); } else { coll.setDBDecoderFactory((sizeDecoder = new SizeReportingBasicBSONDecoder())); } if (limit > 0) { resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort).limit(limit); } else { resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort); } LinkedList<BasicDBObject> list = null; if (!bCsv) { list = new LinkedList<BasicDBObject>(); } final int MAX_SIZE_CSV = 80 * 1024 * 1024; //(80MB) final int MAX_SIZE_JSON = 80 * 1024 * 1024; //(80MB) while (resultCursor.hasNext()) { BasicDBObject x = (BasicDBObject) resultCursor.next(); if (!bCsv) { list.add(x); } if (null != csvDecoder) { if (csvDecoder.getCsv().length() > MAX_SIZE_CSV) { break; } } else if (null != sizeDecoder) { if (sizeDecoder.getSize() > MAX_SIZE_JSON) { break; } } } cmrr.results = list; } finally { coll.setDBDecoderFactory(defaultDecoder); } cmrr.lastCompletionTime = cmr.lastCompletionTime; if (null != csvDecoder) { StringBuffer header = new StringBuffer(); for (String field : csvDecoder.getOrderedFields()) { if (0 != header.length()) { header.append(','); } header.append('"'); header.append(field.replace("\"", "\\\"")); header.append("\""); } header.append('\n'); header.append(csvDecoder.getCsv().toString()); cmrr.results = header.toString(); } rp.setData(cmrr); } //TESTED else { // Case 2: HDFS if ((null != cmr.outputKey) && (null != cmr.outputValue) && cmr.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text") && cmr.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) { // special case, text file try { rp.setData(HadoopUtils.getBsonFromTextFiles(cmr, limit, fields), (BasePojoApiMap<BasicDBList>) null); } catch (Exception e) { rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false, "Files don't appear to be in text file format, did you run the job 
before changing the output to Text/Text?")); } } //TESTED else { // sequence file try { rp.setData(HadoopUtils.getBsonFromSequenceFile(cmr, limit, fields), (BasePojoApiMap<BasicDBList>) null); } catch (Exception e) { rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false, "Files don't appear to be in sequence file format, did you run the job with Text/Text?")); } } //TESTED } //TESTED }
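The sort handling above reads sortField and sortDirection out of the job's post-processing block, with getInt supplying an ascending default of 1 when no direction is given. A minimal sketch of building that sort specification from a parsed JSON object; the field names mirror the ones used above, but the query string itself is hypothetical:

import com.mongodb.BasicDBObject;

public class SortSpecSketch {
    public static void main(String[] args) {
        // Hypothetical post-processing block, parsed the same way as in the example above.
        BasicDBObject postProc = (BasicDBObject) com.mongodb.util.JSON
                .parse("{ \"sortField\": \"score\", \"sortDirection\": -1 }");

        String sortField = postProc.getString("sortField", "_id");
        int sortDir = postProc.getInt("sortDirection", 1); // default: ascending

        BasicDBObject sort = new BasicDBObject(sortField, sortDir);
        System.out.println(sort); // { "score" : -1 }
    }
}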
From source file:org.ossmeter.platform.mining.msr14.Extractor.java
License:Open Source License
public static void main(String[] args) throws Exception { long start = System.currentTimeMillis(); Mongo msrMongo = new Mongo(new ServerAddress("localhost", 1234)); // GitHub challenge data Mongo bioMongo = new Mongo(new ServerAddress("localhost", 12345));// Extracted data // Create indexes Biodiversity bio = new Biodiversity(bioMongo.getDB("biodiversity")); bio.setClearPongoCacheOnSync(true);/*from w w w . j av a2 s . c o m*/ bioMongo.getDB("biodiversity").getCollection("users").ensureIndex(new BasicDBObject("login", 1)); BasicDBObject index = new BasicDBObject(); index.put("name", 1); index.put("ownerName", 1); bioMongo.getDB("biodiversity").getCollection("projects").ensureIndex(index); index = new BasicDBObject(); index.put("projectName", 1); index.put("projectOwner", 1); bioMongo.getDB("biodiversity").getCollection("projectMemberships").ensureIndex(index); index = new BasicDBObject(); index.put("projectName", 1); index.put("userName", 1); bioMongo.getDB("biodiversity").getCollection("projectMemberships").ensureIndex(index); bioMongo.getDB("biodiversity").getCollection("projectMemberships") .ensureIndex(new BasicDBObject("userName", 1)); DB msrDb = msrMongo.getDB("msr14"); // #1 User extraction System.out.println("Extracting users..."); DBCursor cursor = msrDb.getCollection("users").find(); cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); Iterator<DBObject> it = cursor.iterator(); int count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // User user = new User(); // user.setGhId(obj.getString("id")); // user.setLogin(obj.getString("login")); // user.setLocation(obj.getString("location")); // user.setPublicRepos(obj.getInt("public_repos", 0)); // user.setJoinedDate(obj.getString("created_at")); // user.setFollowerCount(obj.getInt("followers", 0)); // user.setFollowingCount(obj.getInt("following", 0)); // user.setPublicGists(obj.getInt("public_gists", 0)); // // bio.getUsers().add(user); // // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // //// #1.2 Project extraction // System.out.println("Extracting projects..."); // cursor = msrDb.getCollection("repos").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // Project project = new Project(); // project.setName(obj.getString("name")); // project.setGhId(obj.getString("id")); // project.setCreatedAt(obj.getString("created_at")); // project.setSize(obj.getInt("size", 0)); // project.setWatchersCount(obj.getInt("watchers",0)); // project.setWatchersCount2(obj.getInt("watchers_count",0)); // project.setLanguage(obj.getString("language")); // project.setForks(obj.getInt("forks", 0)); // project.setForksCount(obj.getInt("forks_count", 0)); // project.setOpenIssues(obj.getInt("open_issues",0)); // project.setOpenIssuesCount(obj.getInt("open_issues_count",0)); // project.setOpenIssues(obj.getInt("open_issues",0)); // project.setNetworkCount(obj.getInt("network_count", 0)); // // BasicDBObject ownerObj = (BasicDBObject) obj.get("owner"); // User owner = null; // if (ownerObj != null) { // owner = bio.getUsers().findOne(User.LOGIN.eq(ownerObj.getString("login"))); // if (owner !=null) { // project.setOwner(owner); // project.setOwnerName(owner.getLogin()); // } // } // bio.getProjects().add(project); // // if (owner != null) { // This comes here as to reference the project, we need to have added to 
the project list first // ProjectMembership pm = getProjectMembership(bio, owner, project); // pm.setOwner(true); // } // // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // // System.exit(0); //// #2 Follower/following extraction // System.out.println("Extracting followers..."); // cursor = msrDb.getCollection("followers").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String followerLogin = obj.getString("login"); // String followedLogin = obj.getString("follows"); // // User follower = bio.getUsers().findOne(User.LOGIN.eq(followerLogin)); // User followed = bio.getUsers().findOne(User.LOGIN.eq(followedLogin)); // // if (follower != null && followed != null) { // follower.getFollowing().add(followed); // followed.getFollowers().add(follower); // } else{ //// System.err.println("Follower or followed is null. Follower: " +follower + ". followed: " + followed); // } // if (follower != null) follower.setFollowingCount(follower.getFollowingCount()+1); // if (followed != null) followed.setFollowerCount(followed.getFollowerCount()+1); // // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // // System.exit(0); System.out.println("Clearing ProjectMembership commit data"); for (ProjectMembership pm : bio.getProjectMemberships()) { pm.setCommitCount(0); pm.setCommitTotalChanges(0); pm.setCommitAdditions(0); pm.setCommitDeletions(0); pm.setCommitsAsAuthor(0); pm.setCommitsAsCommitter(0); pm.setCommitTotalFiles(0); pm.setAverageFilesPerCommit(0); pm.getCommitTimes().clear(); } bio.sync(); System.out.println("cleared."); // #3 Commits System.out.println("Extracting commits..."); cursor = msrDb.getCollection("commits").find(); cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); it = cursor.iterator(); count = 0; while (it.hasNext()) { BasicDBObject obj = (BasicDBObject) it.next(); // Author and committer BasicDBObject commitAuthor = (BasicDBObject) obj.get("author"); BasicDBObject commitCommitter = (BasicDBObject) obj.get("committer"); String authorLogin = ""; if (commitAuthor != null) authorLogin = commitAuthor.getString("login"); String committerLogin = ""; if (commitCommitter != null) committerLogin = commitCommitter.getString("login"); // Stats BasicDBObject stats = (BasicDBObject) obj.get("stats"); if (stats == null) stats = new BasicDBObject(); // Create a new one so we can get zeroed values int total = stats.getInt("total", 0); int additions = stats.getInt("additions", 0); int deletions = stats.getInt("deletions", 0); String commitDate = ((BasicDBObject) ((BasicDBObject) obj.get("commit")).get("author")) .getString("date"); BasicDBList files = (BasicDBList) obj.get("files"); String[] url = convertUrlIntoProjectNameAndOwner(obj.getString("url")); ProjectMembership authorPm = null; ProjectMembership committerPm = null; if (authorLogin != null) { authorPm = getProjectMembership(bio, authorLogin, url[1], url[0]); authorPm.setCommitCount(authorPm.getCommitCount() + 1); authorPm.setCommitTotalChanges(authorPm.getCommitTotalChanges() + total); authorPm.setCommitAdditions(authorPm.getCommitAdditions() + additions); authorPm.setCommitDeletions(authorPm.getCommitDeletions() + deletions); authorPm.setCommitsAsAuthor(authorPm.getCommitsAsAuthor() + 1); if (files != null) 
authorPm.setCommitTotalFiles(authorPm.getCommitTotalChanges() + files.size()); authorPm.setAverageFilesPerCommit(authorPm.getCommitTotalFiles() / authorPm.getCommitCount()); authorPm.getCommitTimes().add(commitDate); } if (authorLogin != null && !authorLogin.equals(committerLogin)) { committerPm = getProjectMembership(bio, committerLogin, url[1], url[0]); committerPm.setCommitCount(committerPm.getCommitCount() + 1); // committerPm.setCommitTotalChanges(committerPm.getCommitTotalChanges()+total); // committerPm.setCommitAdditions(committerPm.getCommitAdditions()+additions); // committerPm.setCommitDeletions(committerPm.getCommitDeletions()+deletions); committerPm.setCommitsAsCommitter(committerPm.getCommitsAsCommitter() + 1); committerPm.setCommitTotalFiles(committerPm.getCommitTotalChanges() + files.size()); committerPm.setAverageFilesPerCommit(committerPm.getCommitTotalFiles() / authorPm.getCommitCount()); if (files != null) committerPm.setCommitTotalFiles(committerPm.getCommitTotalChanges() + files.size()); committerPm .setAverageFilesPerCommit(committerPm.getCommitTotalFiles() / committerPm.getCommitCount()); committerPm.getCommitTimes().add(commitDate); } bio.sync(); count++; if (count % 1000 == 0) { System.out.print(count + ", "); bio.sync(); } } cursor.close(); bio.sync(); System.out.println(); System.exit(0); // if (author != null) { //// if (author.getCommits() ==null) author.setCommits(new Commits()); // author.setCommitCount(author.getCommitCount()+1); // author.setCommitTotalChanges(author.getCommitTotalChanges()+total); // author.setCommitAdditions(author.getCommitAdditions()+additions); // author.setCommitDeletions(author.getCommitDeletions()+deletions); // author.setCommitsAsAuthor(author.getCommitsAsAuthor()+1); // author.getCommitTimes().add(commitDate); // } // if (committer != null) { //// if (committer.getCommits() ==null) committer.setCommits(new Commits()); // committer.setCommitCount(committer.getCommitCount()+1); // committer.setCommitTotalChanges(committer.getCommitTotalChanges()+total); // committer.setCommitAdditions(committer.getCommitAdditions()+additions); // committer.setCommitDeletions(committer.getCommitDeletions()+deletions); // committer.setCommitsAsCommitter(committer.getCommitsAsCommitter()+1); // committer.getCommitTimes().add(commitDate); // } // // ProjectMembership authorPm = null; // ProjectMembership committerPm = null; // //// Only a very small number of commit comments actually reference the repo //// Instead we're going to have to strip the string // String[] url = convertUrlIntoProjectNameAndOwner(obj.getString("url")); // Project project = null; // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(url[1]), Project.OWNERNAME.eq(url[0])).iterator(); // if (repoIt.hasNext()) { // project = repoIt.next(); // if (project != null) { // // project.setCommitCount(project.getCommitCount()+1); // project.setCommitTotalChanges(project.getCommitTotalChanges()+total); // project.setCommitAdditions(project.getCommitAdditions()+additions); // project.setCommitDeletions(project.getCommitDeletions()+deletions); // project.getCommitTimes().add(commitDate); // // if (author != null) { // authorPm = getProjectMembership(bio, author, project); // authorPm.setCommitCount(authorPm.getCommitCount()+1); // authorPm.setCommitTotalChanges(authorPm.getCommitTotalChanges()+total); // authorPm.setCommitAdditions(authorPm.getCommitAdditions()+additions); // authorPm.setCommitDeletions(authorPm.getCommitDeletions()+deletions); // 
authorPm.setCommitsAsAuthor(authorPm.getCommitsAsAuthor()+1); // // // Avoid duplicating information // if (committer != null && author.getLogin().equals(committer.getLogin())) { // authorPm.setCommitsAsCommitter(authorPm.getCommitsAsCommitter()+1); // } // // authorPm.getCommitTimes().add(commitDate); // } // if (committer != null && author != null && !author.getLogin().equals(committer.getLogin())) { // committerPm = getProjectMembership(bio, committer, project); // committerPm.setCommitCount(committerPm.getCommitCount()+1); // committerPm.setCommitTotalChanges(committerPm.getCommitTotalChanges()+total); // committerPm.setCommitAdditions(committerPm.getCommitAdditions()+additions); // committerPm.setCommitDeletions(committerPm.getCommitDeletions()+deletions); // committerPm.setCommitsAsCommitter(committerPm.getCommitsAsCommitter()+1); // // committerPm.getCommitTimes().add(commitDate); // } // } // } // else { // System.err.println("Didn't find project:" + url[0] + ":"+url[1] + ", prestrip: " + obj.getString("url")); // } // bio.getProjectMemberships().sync(); // bio.sync(); // // // // Files // BasicDBList files = (BasicDBList) obj.get("files"); // if (files != null) { // for (Object f : files) { // BasicDBObject file = (BasicDBObject)f; // // String filename = file.getString("filename"); // if (filename.lastIndexOf(".") != -1) { // If it has an extension, we want that. If not, use the entire filename // filename = filename.substring(filename.lastIndexOf(".")); // filename = filename.toLowerCase(); // Ensure consistency // } // // FIXME: Should strip any /'s if there is no '.' - i.e. just the last one // // if (author != null) addArtefact(author, filename); // if (committer != null) addArtefact(committer, filename); //// if (project != null) addArtefact(project, filename); // } // } // // if (author != null && files !=null) { // author.setCommitTotalFiles(author.getCommitTotalFiles()+files.size()); // author.setAverageFilesPerCommit(author.getCommitTotalFiles()/author.getCommitCount()); // } // if (committer != null && files !=null && (author==null || !committer.getLogin().equals(author.getLogin()))) { // committer.setCommitTotalFiles(committer.getCommitTotalFiles()+files.size()); // committer.setAverageFilesPerCommit(committer.getCommitTotalFiles()/committer.getCommitCount()); // } // if (authorPm !=null && files != null) { // authorPm.setCommitTotalFiles(authorPm.getCommitTotalChanges()+files.size()); // authorPm.setAverageFilesPerCommit(authorPm.getCommitTotalFiles()/authorPm.getCommitCount()); // } // if (committerPm != null && files != null) { // committerPm.setCommitTotalFiles(committerPm.getCommitTotalChanges()+files.size()); // committerPm.setAverageFilesPerCommit(committerPm.getCommitTotalFiles()/committerPm.getCommitCount()); // } // // if (project!=null && files != null) { // project.setCommitTotalFiles(project.getCommitTotalChanges()+files.size()); // project.setAverageFilesPerCommit(project.getCommitTotalFiles()/project.getCommitCount()); // } // bio.getProjectMemberships().sync(); // bio.sync(); // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // cursor.close(); // bio.sync(); // System.out.println(); // // System.exit(0); //// #4 Commit comments // System.out.println("Extracting commit comments..."); // cursor = msrDb.getCollection("commit_comments").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) 
it.next(); // // String username = getUserLoginName(bio, "user", "login", obj); // User user = bio.getUsers().findOne(User.LOGIN.eq(username)); // if (user == null) { // System.err.println("Found commit comment with unrecognised user: " + username); // continue; // } // // user.setNumberOfCommitComments(user.getNumberOfCommitComments()+1); // //// if (!user.getDbObject().containsField("commitCommentTimes")) { //// user.getDbObject().put("commitCommentTimes", new BasicDBList()); //// } //// user.getCommitCommentTimes().add(obj.getString("created_at")); // // // Only a very small number of commit comments actually reference the repo // // Instead we're going to have to strip the string // String[] url = convertUrlIntoProjectNameAndOwner(obj.getString("url")); // //// System.out.println("Querying project " + url[1] + " and owner " + url[0]); // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(url[1]), Project.OWNERNAME.eq(url[0])).iterator(); //// if (repoIt.hasNext()) { // Project project = repoIt.next(); // if (project != null) { // project.setNumberOfCommitComments(project.getNumberOfCommitComments()+1); // // if (!project.getDbObject().containsField("commitCommentTimes")) { // project.getDbObject().put("commitCommentTimes", new BasicDBList()); // } // project.getCommitCommentTimes().add(obj.getString("created_at")); // // ProjectMembership pm = getProjectMembership(bio, user, project); // pm.setNumberOfCommitComments(pm.getNumberOfCommitComments()+1); // // if (!pm.getDbObject().containsField("commitCommentTimes")) { // pm.getDbObject().put("commitCommentTimes", new BasicDBList()); // } // pm.getCommitCommentTimes().add(obj.getString("created_at")); // } //// } // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // cursor.close(); // bio.sync(); // System.out.println(); // System.exit(0); //// //FIXME: THIS IS CAUSING THE CPU TO HIS 350% AND THEN KILLS THE LAPTOP?!?!?!?!? 
// #5 Pull requests System.out.println("Extracting pull requests..."); cursor = msrDb.getCollection("pull_requests").find(); cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); it = cursor.iterator(); System.out.println("Clearing previous data"); for (User u : bio.getUsers()) { if (!u.getDbObject().containsField("pullRequestTimes")) { u.getDbObject().put("pullRequestTimes", new BasicDBList()); } u.getPullRequestTimes().clear(); u.setNumberOfPullRequests(0); } bio.sync(); for (Project u : bio.getProjects()) { if (!u.getDbObject().containsField("pullRequestTimes")) { u.getDbObject().put("pullRequestTimes", new BasicDBList()); } u.getPullRequestTimes().clear(); u.setNumberOfPullRequests(0); } bio.sync(); for (ProjectMembership u : bio.getProjectMemberships()) { if (!u.getDbObject().containsField("pullRequestTimes")) { u.getDbObject().put("pullRequestTimes", new BasicDBList()); } u.getPullRequestTimes().clear(); u.setNumberOfPullRequests(0); } bio.sync(); System.out.println("Cleared!"); count = 0; while (it.hasNext()) { BasicDBObject obj = (BasicDBObject) it.next(); String username = getUserLoginName(bio, "user", "login", obj); User user = bio.getUsers().findOne(User.LOGIN.eq(username)); if (user == null) { // System.err.println("Found pull request with unrecognised user:" + username); continue; } if (!user.getDbObject().containsField("pullRequestTimes")) { user.getDbObject().put("pullRequestTimes", new BasicDBList()); } user.getPullRequestTimes().add(obj.getString("created_at")); user.setNumberOfPullRequests(user.getNumberOfPullRequests() + 1); // Project System.out.println(obj.getString("repo") + " " + obj.getString("owner") + obj.getString("_id")); ProjectMembership pm = getProjectMembership(bio, user.getLogin(), obj.getString("repo"), obj.getString("owner")); pm.setNumberOfPullRequests(pm.getNumberOfPullRequests() + 1); if (!pm.getDbObject().containsField("pullRequestTimes")) { pm.getDbObject().put("pullRequestTimes", new BasicDBList()); } pm.getPullRequestTimes().add(obj.getString("created_at")); // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(obj.getString("repo")), Project.OWNERNAME.eq(obj.getString("owner"))).iterator(); // if (repoIt.hasNext()) { // FIXME Causes it to run out of heap! 
// Project project = repoIt.next(); // if (project != null) { // project.setNumberOfPullRequests(project.getNumberOfPullRequests()+1); // // if (!project.getDbObject().containsField("pullRequestTimes")) { // project.getDbObject().put("pullRequestTimes", new BasicDBList()); // } // project.getPullRequestTimes().add(obj.getString("created_at")); // // } // } else { // System.err.println("Didn't find project:" + obj.getString("repo") + ":"+obj.getString("owner")); // } count++; if (count % 1000 == 0) { System.out.print(count + ", "); bio.sync(); System.gc(); } } bio.sync(); System.out.println(); System.exit(0); //// #6 Pull request comments // System.out.println("Extracting pull request comments..."); // cursor = msrDb.getCollection("pull_request_comments").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String username = getUserLoginName(bio, "user", "login", obj); // User user = bio.getUsers().findOne(User.LOGIN.eq(username)); // if (user == null) { //// System.err.println("Found pull request comment with unrecognised user:" + username); // continue; // } // // if (!user.getDbObject().containsField("pullRequestCommentTimes")) { // user.getDbObject().put("pullRequestCommentTimes", new BasicDBList()); // } // user.getPullRequestCommentTimes().add(obj.getString("created_at")); // user.setNumberOfPullRequestComments(user.getNumberOfPullRequestComments()+1); // // // Project // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(obj.getString("repo")), Project.OWNERNAME.eq(obj.getString("owner"))).iterator(); //// if (repoIt.hasNext()) { // Project project = repoIt.next(); // if (project != null) { // project.setNumberOfPullRequestComments(project.getNumberOfPullRequestComments()+1); // if (!project.getDbObject().containsField("pullRequestCommentTimes")) { // project.getDbObject().put("pullRequestCommentTimes", new BasicDBList()); // } // project.getPullRequestCommentTimes().add(obj.getString("created_at")); // // ProjectMembership pm = getProjectMembership(bio, user, project); // pm.setNumberOfPullRequestComments(pm.getNumberOfPullRequestComments()+1); // // if (!pm.getDbObject().containsField("pullRequestCommentTimes")) { // pm.getDbObject().put("pullRequestCommentTimes", new BasicDBList()); // } // pm.getPullRequestCommentTimes().add(obj.getString("created_at")); // } //// } // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // System.exit(0); //// #7 Issues // System.out.println("Extracting issues..."); // cursor = msrDb.getCollection("issues").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String username = getUserLoginName(bio, "user", "login", obj); // User user = bio.getUsers().findOne(User.LOGIN.eq(username)); // if (user == null) { //// System.err.println("Found issue with unrecognised user:" + username); // continue; // } // // if (!user.getDbObject().containsField("issueTimes")) { // user.getDbObject().put("issueTimes", new BasicDBList()); // } // user.getIssueTimes().add(obj.getString("created_at")); // user.setNumberOfIssues(user.getNumberOfIssues()+1); // // // Project // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(obj.getString("repo")), 
Project.OWNERNAME.eq(obj.getString("owner"))).iterator(); // if (repoIt.hasNext()) { // Project project = repoIt.next(); // if (project != null) { // project.setNumberOfIssues(project.getNumberOfIssues()+1); // // if (!project.getDbObject().containsField("issueTimes")) { // project.getDbObject().put("issueTimes", new BasicDBList()); // } // project.getIssueTimes().add(obj.getString("created_at")); // // ProjectMembership pm = getProjectMembership(bio, user, project); // pm.setNumberOfIssues(pm.getNumberOfIssues()+1); // // if (!pm.getDbObject().containsField("issueTimes")) { // pm.getDbObject().put("issueTimes", new BasicDBList()); // } // pm.getIssueTimes().add(obj.getString("created_at")); // } // } // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // System.exit(0); //// #8 Issue comments // System.out.println("Extracting issue comments..."); // cursor = msrDb.getCollection("issue_comments").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String username = getUserLoginName(bio, "user", "login", obj); // User user = bio.getUsers().findOne(User.LOGIN.eq(username)); // if (user == null) { //// System.err.println("Found issue comment with unrecognised user:" + username); // continue; // } // // if (!user.getDbObject().containsField("issueCommentTimes")) { // user.getDbObject().put("issueCommentTimes", new BasicDBList()); // } // user.getIssueCommentTimes().add(obj.getString("created_at")); // user.setNumberOfIssueComments(user.getNumberOfIssueComments()+1); // // // Project // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(obj.getString("repo")), Project.OWNERNAME.eq(obj.getString("owner"))).iterator(); // if (repoIt.hasNext()) { // Project project = repoIt.next(); // if (project != null) { // project.setNumberOfIssueComments(project.getNumberOfIssueComments()+1); // // if (!project.getDbObject().containsField("issueCommentTimes")) { // project.getDbObject().put("issueCommentTimes", new BasicDBList()); // } // project.getIssueCommentTimes().add(obj.getString("created_at")); // // ProjectMembership pm = getProjectMembership(bio, user, project); // pm.setNumberOfIssueComments(pm.getNumberOfIssueComments()+1); // // if (!pm.getDbObject().containsField("issueCommentTimes")) { // pm.getDbObject().put("issueCommentTimes", new BasicDBList()); // } // pm.getIssueCommentTimes().add(obj.getString("created_at")); // } // } // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // System.exit(0); //// #9 Issue events // System.out.println("Extracting issue events..."); // cursor = msrDb.getCollection("issue_events").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String username = getUserLoginName(bio, "actor", "login", obj); // User user = bio.getUsers().findOne(User.LOGIN.eq(username)); // if (user == null) { //// System.err.println("Found issue event with unrecognised user:" + username); // continue; // } // // String eventKind = obj.getString("event"); // IssueEventKind kind = null; //FIXME // // switch (eventKind) { // case "closed": kind = IssueEventKind.CLOSED; break; // case "assigned": kind = IssueEventKind.ASSIGNED; break; // case 
"mentioned": kind = IssueEventKind.MENTIONED; break; // case "merged": kind = IssueEventKind.MERGED; break; // case "referenced": kind = IssueEventKind.REFERENCED; break; // case "reopened": kind = IssueEventKind.REOPENED; break; // case "subscribed": kind = IssueEventKind.SUBSCRIBED; break; // case "head_ref_deleted" : kind = IssueEventKind.HEAD_REF_DELETED; break; // case "head_ref_restored" : kind = IssueEventKind.HEAD_REF_RESTORED; break; // case "head_ref_cleaned" : kind = IssueEventKind.HEAD_REF_CLEANED; break; // case "unsubscribed" : kind = IssueEventKind.UNSUBSCRIBED; break; // default: // System.err.println("Unrecognised issue event kind: " + eventKind); // } // if (kind == null) continue; // // boolean eventKindFound = false; // // if (!user.getDbObject().containsField("issueEvents")) { // user.getDbObject().put("issueEvents", new BasicDBList()); // } // // for (IssueEvent ie : user.getIssueEvents()) { // if (ie.getEventKind().equals(kind)) { // ie.setCount(ie.getCount()+1); // eventKindFound = true; // break; // } // } // if (!eventKindFound) { // IssueEvent ie = new IssueEvent(); // ie.setEventKind(kind); // ie.setCount(1); // user.getIssueEvents().add(ie); // } // // // Project // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(obj.getString("repo")), Project.OWNERNAME.eq(obj.getString("owner"))).iterator(); // if (repoIt.hasNext()) { // Project project = repoIt.next(); // // if (!project.getDbObject().containsField("issueEvents")) { // project.getDbObject().put("issueEvents", new BasicDBList()); // } // // eventKindFound = false; // for (IssueEvent ie : project.getIssueEvents()) { // if (ie.getEventKind().equals(kind)) { // ie.setCount(ie.getCount()+1); // eventKindFound = true; // break; // } // } // if (!eventKindFound) { // IssueEvent ie = new IssueEvent(); // ie.setEventKind(kind); // ie.setCount(1); // project.getIssueEvents().add(ie); // } // // ProjectMembership pm = getProjectMembership(bio, user, project); // // if (!pm.getDbObject().containsField("issueEvents")) { // pm.getDbObject().put("issueEvents", new BasicDBList()); // } // // eventKindFound = false; // for (IssueEvent ie : pm.getIssueEvents()) { // if (ie.getEventKind().equals(kind)) { // ie.setCount(ie.getCount()+1); // eventKindFound = true; // break; // } // } // if (!eventKindFound) { // IssueEvent ie = new IssueEvent(); // ie.setEventKind(kind); // ie.setCount(1); // pm.getIssueEvents().add(ie); // } // } // // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); // System.exit(0); // //// Watchers // System.out.println("Extracting watchers..."); // cursor = msrDb.getCollection("watchers").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // User user = bio.getUsers().findOne(User.LOGIN.eq(obj.getString("login"))); // if (user == null) continue; // // Iterator<Project> repoIt = bio.getProjects().find(Project.NAME.eq(obj.getString("repo")), Project.OWNERNAME.eq(obj.getString("owner"))).iterator(); // if (repoIt.hasNext()) { // Project project = repoIt.next(); // if (project != null && !project.getWatchers().contains(user)) project.getWatchers().add(user); // if (!user.getWatches().contains(project)) user.getWatches().add(project); // } // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // System.out.println(); //// 
Org members FIXME: INCOMPLETE: Cannot match the org name against ANYTHING.... // System.out.println("Extracting org members..."); // cursor = msrDb.getCollection("org_members").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String login = obj.getString("login"); // String orgName = obj.getString("org"); // // User user = bio.getUsers().findOne(User.LOGIN.eq(login)); // // User org = bio.getUsers().findOne(User.LOGIN.eq(orgName)); // // if (org!=null){ // System.err.println("Found org! " + orgName); // // } // //// Project project = bio.getProjects().findOne(Project.OWNERNAME.eq("orgName")); //// if (project==null) { //// System.err.println("Didn't find project: " + orgName); //// continue; //// } //// ProjectMembership pm = getProjectMembership(bio, user, project); //// pm.setOrgMember(true); // } // bio.sync(); // System.out.println(); // Repo collaborators // System.out.println("Extracting repo collaborators..."); // cursor = msrDb.getCollection("repo_collaborators").find(); // cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT); // it = cursor.iterator(); // // count = 0; // while(it.hasNext()){ // BasicDBObject obj = (BasicDBObject) it.next(); // // String login = obj.getString("login"); // String projectName = obj.getString("repo"); // String ownerName = obj.getString("owner"); // // User user = bio.getUsers().findOne(User.LOGIN.eq(login)); // Iterator<Project> repoIt = bio.getProjects().find(Project.OWNERNAME.eq(ownerName), Project.NAME.eq(projectName)).iterator(); // if (repoIt.hasNext()) { // Project project = repoIt.next(); // // ProjectMembership pm = getProjectMembership(bio, user, project); // pm.setCollaborator(true); // } else { // System.err.println("Couldn't find repo. owner: " + ownerName + ", repo: " + projectName); // } // count++; // if (count % 1000 == 0) { // System.out.print(count + ", "); // bio.sync(); // } // } // bio.sync(); // long end = System.currentTimeMillis(); System.out.println("Finished at " + new Date()); long duration = end - start; System.out.println("Duration: " + duration); }
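The snippet above (largely commented out in the original source) walks MongoDB collections with a DBCursor, reads fields from each BasicDBObject, and keeps running counts. For reference, here is a minimal, hedged sketch of the same cursor-iteration pattern using getInt(key, default); the database name "msr", the collection name "watchers", and the optional "followers" field are hypothetical, and a mongod reachable on localhost:27017 is assumed.

import com.mongodb.BasicDBObject;
import com.mongodb.Bytes;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class WatcherCountSketch {
    public static void main(String[] args) throws Exception {
        MongoClient mongo = new MongoClient("localhost", 27017);
        DB db = mongo.getDB("msr");                            // hypothetical database name
        DBCollection watchers = db.getCollection("watchers");  // hypothetical collection name
        DBCursor cursor = watchers.find();
        cursor.addOption(Bytes.QUERYOPTION_NOTIMEOUT);         // same no-timeout option as above

        int count = 0;
        while (cursor.hasNext()) {
            BasicDBObject obj = (BasicDBObject) cursor.next();
            String login = obj.getString("login");
            // "followers" is a hypothetical optional field; getInt(key, def)
            // falls back to 0 instead of throwing when the field is absent.
            int followers = obj.getInt("followers", 0);
            count++;
            if (count % 1000 == 0) {
                System.out.println(count + " watchers processed (last: " + login
                        + ", followers=" + followers + ")");
            }
        }
        cursor.close();
        mongo.close();
    }
}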
From source file:tango.gui.DataManager.java
License:Open Source License
private void extractData() {
    HashMap<MultiKey, TreeSet<String>> newC2CKeys = new HashMap<MultiKey, TreeSet<String>>();
    TreeSet<String> NucKeysToAdd = new TreeSet<String>();
    DBCursor cur = mc.getXPNuclei(xp.getName());
    cur.sort(new BasicDBObject("field_id", 1).append("idx", 1));
    int nbNuc = cur.count();
    IJ.log("extract data nb nuc:" + nbNuc);
    objectMes = new HashMap<Integer, TreeMap<MultiKey3D, String>>(ojectKeys.size());
    nbObjects = new TreeMap<MultiKey2D, int[]>();
    nucTags = new TreeMap<MultiKey2D, Integer>();
    nucIds = new TreeMap<MultiKey2D, String>();
    for (int i : ojectKeys.keySet()) {
        if (i < 0) {
            continue;
        }
        objectMes.put(i, new TreeMap<MultiKey3D, String>());
    }
    if (!ojectKeys.containsKey(0)) {
        objectMes.put(0, new TreeMap<MultiKey3D, String>());
        ojectKeys.put(0, new TreeSet<String>());
    }
    o2oMes = new HashMap<MultiKey, TreeMap<MultiKey4D, String>>();
    for (MultiKey dk : c2cKeys.keySet()) {
        o2oMes.put(dk, new TreeMap<MultiKey4D, String>());
        newC2CKeys.put(dk, new TreeSet<String>());
    }
    while (cur.hasNext()) {
        BasicDBObject nuc = (BasicDBObject) cur.next();
        if (nuc.getInt("tag", 0) < 0) {
            continue; // exclude negative tags
        }
        ObjectId nucId = (ObjectId) nuc.get("_id");
        int nucIdx = nuc.getInt("idx");
        String fieldName = mc.getField((ObjectId) nuc.get("field_id")).getString("name");
        int[] nbPart = new int[channelNames.length];
        // measure objects
        for (int i = 0; i < channelNames.length; i++) {
            TreeMap<MultiKey3D, String> omes = objectMes.get(i);
            TreeSet<String> keys = ojectKeys.get(i);
            DBCursor cursor = mc.getObjectsCursor(nucId, i);
            cursor.sort(new BasicDBObject("idx", 1));
            nbPart[i] = cursor.count();
            if (keys != null && !keys.isEmpty()) {
                while (cursor.hasNext()) {
                    BasicDBObject o = (BasicDBObject) cursor.next();
                    //IJ.log("o="+o);
                    //IJ.log("omes="+omes);
                    //IJ.log("f="+fieldName+" "+nucIdx+" "+o.getInt("idx")+" "+keys);
                    for (String k : keys) {
                        //IJ.log("k="+k+" "+o.getString(k));
                        if (o.getString(k) != null) {
                            omes.put(new MultiKey3D(fieldName, nucIdx, o.getInt("idx"), k), o.get(k).toString());
                        }
                    }
                }
            }
            cursor.close();
        }
        String s = "";
        for (int i : nbPart) {
            s += i + ";";
        }
        //IJ.log("nb objects:" + s);
        MultiKey2D k2D = new MultiKey2D(fieldName, nucIdx, "nbParts");
        nbObjects.put(k2D, nbPart);
        nucTags.put(k2D, nuc.getInt("tag", 0));
        nucIds.put(k2D, nuc.getString("_id"));
        // C2C
        TreeMap<MultiKey3D, String> nucMes = objectMes.get(0);
        for (MultiKey dk : c2cKeys.keySet()) {
            if (dk.getKey(0) < 0) {
                continue;
            }
            int size = (dk.getKey(0) != dk.getKey(1)) ? nbPart[dk.getKey(0)] * nbPart[dk.getKey(1)]
                    : nbPart[dk.getKey(0)] * (nbPart[dk.getKey(0)] - 1) / 2;
            BasicDBObject mes = mc.getMeasurementStructure(nucId, dk.getKeys(), true);
            //IJ.log("get mes:" + dk + " mes");
            TreeMap<MultiKey4D, String> o2oMesDk = o2oMes.get(dk);
            TreeSet<String> keys = c2cKeys.get(dk);
            TreeSet<String> newKeys = newC2CKeys.get(dk);
            for (String k : keys) {
                Object o = mes.get(k);
                if (o instanceof BasicDBList) {
                    BasicDBList list = ((BasicDBList) o);
                    if (list.size() == size) {
                        int count = 0;
                        if (dk.getKey(0) != dk.getKey(1)) {
                            for (int p1 = 1; p1 <= nbPart[dk.getKey(0)]; p1++) {
                                for (int p2 = 1; p2 <= nbPart[dk.getKey(1)]; p2++) {
                                    o2oMesDk.put(new MultiKey4D(fieldName, nucIdx, p1, p2, k), list.get(count).toString());
                                    count++;
                                }
                            }
                        } else {
                            for (int p1 = 1; p1 < nbPart[dk.getKey(0)]; p1++) {
                                for (int p2 = p1 + 1; p2 <= nbPart[dk.getKey(1)]; p2++) {
                                    o2oMesDk.put(new MultiKey4D(fieldName, nucIdx, p1, p2, k), list.get(count).toString());
                                    count++;
                                }
                            }
                        }
                        newKeys.add(k);
                    }
                } else if (o instanceof Number || o instanceof String) {
                    String newKey = channelNames[dk.getKey(0)] + "." + channelNames[dk.getKey(1)] + "." + k;
                    nucMes.put(new MultiKey3D(fieldName, nucIdx, 1, newKey), o.toString());
                    NucKeysToAdd.add(newKey);
                }
            }
        }
    }
    cur.close();
    this.ojectKeys.get(0).addAll(NucKeysToAdd);
    this.c2cKeys = newC2CKeys;
}
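In extractData() above, nuc.getInt("tag", 0) treats a missing "tag" field as 0 and skips nuclei whose tag is negative, while nuc.getInt("idx") is read without a default because "idx" is expected to always be present (the single-argument form throws a NullPointerException for a missing key). The following self-contained sketch, with hypothetical documents and field values, shows that difference in isolation; it only needs the mongo-java-driver on the classpath.

import com.mongodb.BasicDBObject;

public class TagFilterSketch {
    public static void main(String[] args) {
        // Hypothetical nucleus-like documents; only the first one carries a "tag" field.
        BasicDBObject tagged = new BasicDBObject("idx", 1).append("tag", -1);
        BasicDBObject untagged = new BasicDBObject("idx", 2);

        for (BasicDBObject nuc : new BasicDBObject[] { tagged, untagged }) {
            // getInt(key, def) returns the default when the field is missing,
            // so documents without a "tag" are treated as tag 0 and kept.
            if (nuc.getInt("tag", 0) < 0) {
                continue; // excluded, like the negative-tag filter above
            }
            // getInt(key) without a default would throw a NullPointerException
            // if "idx" were missing, so it is only safe for mandatory fields.
            System.out.println("keeping nucleus idx=" + nuc.getInt("idx"));
        }
    }
}

Run as written, this keeps only the document with idx=2, since the first one carries a negative tag and the second one falls back to the default tag of 0.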
From source file:tango.parameter.KeyParameter.java
License:Open Source License
@Override
public void dbGet(BasicDBObject DBO) {
    Object sd = DBO.get(id);
    if (sd != null) {
        BasicDBObject subDBO = (BasicDBObject) sd;
        key.setText(subDBO.getString("name"));
        checkbox.setSelected(subDBO.getBoolean("do"));
        type = subDBO.getInt("type", 0);
    }
}
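Here getInt("type", 0) is read from a nested sub-document, so a stored parameter without a "type" entry falls back to type 0 instead of failing. A minimal self-contained sketch of the same pattern, using a hypothetical "keyParameter" sub-document and field names, might look like this.

import com.mongodb.BasicDBObject;

public class GetIntDefaultDemo {
    public static void main(String[] args) {
        // Hypothetical nested document mirroring the "name"/"do"/"type" layout above;
        // "type" is deliberately left out to show the default being used.
        BasicDBObject sub = new BasicDBObject("name", "myKey").append("do", true);
        BasicDBObject dbo = new BasicDBObject("keyParameter", sub);

        BasicDBObject subDBO = (BasicDBObject) dbo.get("keyParameter");
        int type = subDBO.getInt("type", 0);    // "type" is absent, so the default 0 is returned
        System.out.println("type = " + type);   // prints: type = 0

        subDBO.put("type", 2);
        System.out.println("type = " + subDBO.getInt("type", 0)); // prints: type = 2
    }
}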