List of usage examples for com.mongodb BasicDBObject get
public Object get(final String key)
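Before the per-project examples below, a minimal self-contained sketch of the basic contract (field names here are illustrative, not taken from the examples): get returns the raw Object stored against the key, or null if the key is absent, so callers typically cast to the expected type and null-check.

import java.util.Arrays;
import java.util.Date;
import com.mongodb.BasicDBObject;

public class BasicDBObjectGetSketch {
    public static void main(String[] args) {
        // Build a document with a plain field and a nested operator sub-document
        BasicDBObject query = new BasicDBObject("title", "example")
                .append("modified", new Date())
                .append("communityId", new BasicDBObject("$in", Arrays.asList("c1", "c2")));

        // get() returns Object, so the caller casts to the expected type
        Date modified = (Date) query.get("modified");
        BasicDBObject communityIdsIn = (BasicDBObject) query.get("communityId");

        // Absent keys return null rather than throwing
        Object missing = query.get("no_such_key");

        System.out.println(modified + " / " + communityIdsIn.get("$in") + " / " + missing);
    }
}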
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked") public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery) { // Get the communityIds from the query Collection<ObjectId> communityIds = null; try {//from ww w.j a v a2 s . c o m BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_); communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_); if (null == communityIds) { return false; } } catch (Exception e) { //DEBUG //e.printStackTrace(); return false; // back out } BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds)); BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1); keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1); keyFields.put(SourcePojo.highestDistributionFactorStored_, 1); // Get and remove the sourceKey information, incorporate into source query, // so it's nice and simple by the time it gets to the actual query Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_); if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless if (null != sourceKeyQueryTerm) { keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); } keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_)); } //TESTED (including $all to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db", "tagtest2" ] else if (null != sourceKeyQueryTerm) { boolean sourceKeyQueryComplex = false; if (sourceKeyQueryTerm instanceof BasicDBObject) { BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm; if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be lt/lte/gt/gte for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) { if (!sourceKeyQueryTermEl.equals(DbManager.in_) && !sourceKeyQueryTermEl.equals(DbManager.lt_) && !sourceKeyQueryTermEl.equals(DbManager.lte_) && !sourceKeyQueryTermEl.equals(DbManager.gt_) && !sourceKeyQueryTermEl.equals(DbManager.gte_)) { sourceKeyQueryComplex = true; break; } //TESTED (eg ne) else if (sourceKeyQueryTermEl.equals(DbManager.in_) && (1 != sourceKeyQueryTermDbo.size())) { sourceKeyQueryComplex = true; break; } //TESTED ((lt,in)) } } //TESTED: (in, (gte,lt), ne) else { sourceKeyQueryComplex = true; } //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte":"test" } }) } else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a sourceKeyQueryComplex = true; } //TESTED ($regex) if (sourceKeyQueryComplex) { keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below } else { return false; // already have a perfectly good source key specification } } //TESTED (See combinations above) DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields); int count = dbc.count(); if (count > 5000) { // (too many source keys to process, just going to leave well alone... 
note will mean $srctags will fail open) return false; } else { ArrayList<String> sources = new ArrayList<String>(count); while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); String sourceKey = (String) dbo.get(SourcePojo.key_); Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_); sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor)); } if (sources.isEmpty()) { throw new RuntimeException(); // will just return no splits at all, no problem } //TESTED if (1 == sources.size()) { query.put(DocumentPojo.sourceKey_, sources.get(0)); } //TESTED else { query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources)); } //TESTED return true; } }
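The core get() usage in the example above is pulling the nested $in sub-document out of the incoming query and casting it. A hedged, stand-alone distillation of just that step (the DocumentPojo/DbManager constants are replaced with literal field names) might look like:

import java.util.Arrays;
import java.util.Collection;
import com.mongodb.BasicDBObject;
import org.bson.types.ObjectId;

public class CommunityIdExtractionSketch {
    @SuppressWarnings("unchecked")
    static Collection<ObjectId> extractCommunityIds(BasicDBObject query) {
        try {
            // query is expected to look like { "communityId": { "$in": [ObjectId, ...] }, ... }
            BasicDBObject communityIdsIn = (BasicDBObject) query.get("communityId");
            return (Collection<ObjectId>) communityIdsIn.get("$in"); // may be null
        } catch (Exception e) {
            return null; // back out if the query doesn't have the expected shape (null / wrong type)
        }
    }

    public static void main(String[] args) {
        BasicDBObject query = new BasicDBObject("communityId",
                new BasicDBObject("$in", Arrays.asList(new ObjectId(), new ObjectId())));
        System.out.println(extractCommunityIds(query));
    }
}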
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked") public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query, BasicDBObject srcTagsQuery, int maxCountPerTask) { // Get the communityIds from the query Collection<ObjectId> communityIds = null; try {//from w ww . j av a2 s.c om BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_); communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_); if (null == communityIds) { return null; } } catch (Exception e) { return null; // back out } BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds)); BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1); keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1); BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1); // Get and remove the sourceKey information, incorporate into source query: Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_); if (null != sourceKeyQueryTerm) { keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); } //TESTED if (null != srcTagsQuery) { keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_)); } //TESTED DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields); // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause) if (dbc.count() > 5000) { // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open) return null; } else { //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>(); // Build collections of objects of format { sourceKey: string or [], totalDocs } BasicDBList sourceKeyListCollection = new BasicDBList(); BasicDBList sourceKeyList = null; int runningDocs = 0; int runningSources = 0; while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); String sourceKey = (String) dbo.get(SourcePojo.key_); if (null != sourceKey) { long docCount = 0L; try { BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_); if (null != harvestStatus) { docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L); } } catch (Exception e) { } //DEBUG //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList); if (docCount > maxCountPerTask) { // source is large enough by itself // Create collection BasicDBObject collection = new BasicDBObject(); collection.put(DocumentPojo.sourceKey_, sourceKey); collection.put(SourceHarvestStatusPojo.doccount_, docCount); sourceKeyListCollection.add(collection); // (leaving running* alone, can keep building that) } //TESTED (by eye, system community of demo cluster) else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources if (null == sourceKeyList) { sourceKeyList = new BasicDBList(); } sourceKeyList.add(sourceKey); // Create collection BasicDBObject collection = new BasicDBObject(); collection.put(DocumentPojo.sourceKey_, sourceKeyList); collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount); sourceKeyListCollection.add(collection); sourceKeyList = null; runningDocs = 0; runningSources = 0; } //TESTED (by eye, system community of demo cluster) else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable sourceKeyList.add(sourceKey); // Create collection BasicDBObject collection = new BasicDBObject(); 
collection.put(DocumentPojo.sourceKey_, sourceKeyList); collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount); sourceKeyListCollection.add(collection); sourceKeyList = null; runningDocs = 0; runningSources = 0; } //TESTED (by eye, system community of demo cluster) else { // (keep) build(ing) list if (null == sourceKeyList) { sourceKeyList = new BasicDBList(); } sourceKeyList.add(sourceKey); runningDocs += docCount; runningSources++; } //TESTED (by eye, system community of demo cluster) } //(end if has source key) } //(end loop over cursor) // Finish off: if (null != sourceKeyList) { // Create collection BasicDBObject collection = new BasicDBObject(); collection.put(DocumentPojo.sourceKey_, sourceKeyList); collection.put(SourceHarvestStatusPojo.doccount_, runningDocs); sourceKeyListCollection.add(collection); } //TESTED (by eye, system community of demo cluster) if (sourceKeyListCollection.isEmpty()) { // query returns empty throw new RuntimeException("Communities contain no sources"); } return sourceKeyListCollection; } // (end if too many source keys across the communities) }
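A small point worth isolating from the example above: when a nested sub-document may be missing, the code casts the result of get() and then reads the numeric field through the typed accessor with a default, as the example does with getLong(key, 0L). A minimal sketch with placeholder field names ("harvest"/"doccount" stand in for SourcePojo.harvest_ / SourceHarvestStatusPojo.doccount_):

import com.mongodb.BasicDBObject;

public class NestedCountSketch {
    static long docCount(BasicDBObject sourceDbo) {
        long docCount = 0L;
        try {
            BasicDBObject harvestStatus = (BasicDBObject) sourceDbo.get("harvest");
            if (null != harvestStatus) {
                docCount = harvestStatus.getLong("doccount", 0L);
            }
        } catch (Exception e) {
            // leave docCount at 0 if the field is missing or has an unexpected type
        }
        return docCount;
    }

    public static void main(String[] args) {
        BasicDBObject source = new BasicDBObject("harvest", new BasicDBObject("doccount", 42L));
        System.out.println(docCount(source)); // 42
    }
}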
From source file:com.ikanow.infinit.e.data_model.utils.JsonPrettyPrinter.java
License:Apache License
public static void jsonObjectToTextFormatted(BasicDBObject jsonObject, int indent, StringBuffer jsonText) {
    jsonText.append("{ ");
    appendNewLine(indent + 2, jsonText);
    Set<String> keys = new TreeSet<String>(jsonObject.keySet());
    boolean isFirst = true;
    for (String key : keys) {
        Object val = jsonObject.get(key);
        if (null == val)
            continue;
        if (isFirst) {
            isFirst = false;
        } else {
            jsonText.append(", ");
            appendNewLine(indent + 2, jsonText);
        }
        jsonText.append("\"");
        jsonText.append(key);
        jsonText.append("\" : ");
        jsonObjectToTextFormatted(val, indent + 2, jsonText);
    }
    appendNewLine(indent, jsonText);
    jsonText.append("}");
}
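The method above is one arm of a recursive pretty-printer (appendNewLine and the Object overload live elsewhere in the class). As a rough, self-contained illustration of the same keySet()/get() traversal idea, under the assumption that only scalars and nested BasicDBObjects appear:

import com.mongodb.BasicDBObject;

public class DboWalkSketch {
    // Recursively print every key/value pair, using get() to fetch each value
    static void dump(BasicDBObject dbo, String prefix) {
        for (String key : dbo.keySet()) {
            Object val = dbo.get(key);
            if (val instanceof BasicDBObject) {
                dump((BasicDBObject) val, prefix + key + ".");
            } else {
                System.out.println(prefix + key + " = " + val);
            }
        }
    }

    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject("title", "example")
                .append("harvest", new BasicDBObject("doccount", 42));
        dump(doc, "");
    }
}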
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
/**
 * Tests to see if duplicates might exist.
 * If it is not a duplicate, true is returned. If it is a duplicate,
 * the modified date is then checked to see if the file has been updated.
 * True is returned if the file has been updated, false otherwise.
 *
 * @param modifiedDate
 * @param sourceUrl
 * @param source
 * @return boolean (true/false)
 */
public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) {
    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);
                    if (null != source.getDistributionFactor()) {
                        // This is a slightly more complex case because other...
                        //...threads for this source could be writing documents asynchronously ... so we're just going to disable everything
                        //if the most recent doc is _after_ our last harvest time (since this means we've already started harvesting the new source)
                        Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo
                                .get(DocumentPojo.created_);
                        if ((null != source.getHarvestStatus())
                                && (null != source.getHarvestStatus().getHarvested()
                                        && (null != mostRecentlyModifedFile_createdTime))) {
                            if (mostRecentlyModifedFile_createdTime
                                    .after(source.getHarvestStatus().getHarvested())) {
                                _mostRecentlyModifiedFile = null;
                                _mostRecentlyModifiedDocId = null;
                            }
                        } else { // If we don't have a date then force a "slow" dedup
                            _mostRecentlyModifiedFile = null;
                            _mostRecentlyModifiedDocId = null;
                        }
                    } //TESTED
                } //(found docs)
            } //(success mode)
        } catch (Exception e) {
        } // If anything goes wrong will just check all files (slower)
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;
        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    else if (null == sourceUrl) {
        return true; // (for custom checking - if we couldn't get a cached value to compare against then assume we are inspecting)
    }

    // No short cut, go the long way round:
    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.sourceUrl_, sourceUrl);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2);
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);
    DBCursor dbc = collection.find(query, fields).hint(hint).limit(1);
    // (this should be very fast since sourceUrl is indexed ... order doesn't matter as all docs should have the same modified)
    //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky
    if (!dbc.hasNext()) { //if there is no record, return true
        ret = true;
        modifiedDate.setTime(0);
    } else { // (all docs should have same modified, though this is ~ time ordered anyway)
        BasicDBObject dbo = (BasicDBObject) dbc.iterator().next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);
        ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000));
        // ie if different -> true -> update docs from sourceUrl
        // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where)
        // (this is just backwards compatible for a deployment where this has happened for some % -probably 100- of the docs
        //  once an RPM >=5955 is deployed this will no longer be necessary)
    }
    return ret;
}
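The dedup decision above boils down to reading the cached 'modified' Date back out of the stored document with get() and comparing it at one-second granularity. A stripped-down sketch of just that comparison (the "modified" field name mirrors DocumentPojo.modified_; everything else is hypothetical):

import java.util.Date;
import com.mongodb.BasicDBObject;

public class ModifiedDateCheckSketch {
    // true == the incoming file looks newer/different, so the document needs re-harvesting
    static boolean needsUpdate(BasicDBObject storedDoc, Date incomingModified) {
        Date oldModified = (Date) storedDoc.get("modified");
        if (null == oldModified) {
            return true; // nothing stored to compare against
        }
        // compare at 1s granularity, as the harvester does
        return (incomingModified.getTime() / 1000L) != (oldModified.getTime() / 1000L);
    }

    public static void main(String[] args) {
        BasicDBObject stored = new BasicDBObject("modified", new Date(1000000000L * 1000L));
        System.out.println(needsUpdate(stored, new Date(1000000000L * 1000L + 500))); // false (same second)
        System.out.println(needsUpdate(stored, new Date(1000000002L * 1000L)));       // true
    }
}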
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) { // Performance shortcut: if (!_bCalculatedMostRecentlyModifiedFile) { _bCalculatedMostRecentlyModifiedFile = true; // Get date of most recently modified file: try {/* w w w. j av a2 s .co m*/ if ((null != source.getHarvestStatus()) && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) { BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); if (null != source.getDistributionFactor()) { // if distributed, then apply extra term if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) { Date d = source.getHarvestStatus().getDistributedLastCompletedCycle(); mostRecentQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.lte_, new ObjectId(d))); } } //TESTED BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1); BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1); if (null != source.getDistributionFactor()) { // (need the created date also mostRecentFields.put(DocumentPojo.created_, 1); } DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata() .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1); if (mostRecentDocs.hasNext()) { BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next(); _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_); _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_); } //TESTED (found docs) //DEBUG //if (null != _mostRecentlyModifiedDocId) // System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime())); } //(success mode) } catch (Exception e) { } // If anything goes wrong will just check all files (slower) } //TESTED if (null != _mostRecentlyModifiedFile) { // Use short cut... long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L; long nFileTime = modifiedDate.getTime() / 1000L; if (nFileTime <= nMostRecentlyModifiedTime) { return false; } } //TESTED if (null == url) { // use this call with url==null to just check the modified file... return true; } // No short cut, go the long way round: DBCollection collection = DbManager.getDocument().getMetadata(); boolean ret = true; BasicDBObject query = new BasicDBObject(); query.put(DocumentPojo.url_, url); query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1); DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1) boolean foundMatch = dbc.hasNext(); if (!foundMatch) { //if there is no record, return true ret = true; } else { BasicDBObject dbo = (BasicDBObject) dbc.next(); Date oldModified = (Date) dbo.get(DocumentPojo.modified_); if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match if (!dbc.hasNext()) { // 1 matching doc, different modified times so update ret = true; } //TESTED else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur) // (slightly slow but should be OK because not going to happen very often) int nCount = dbc.count(); query.put(DocumentPojo.modified_, modifiedDate); ret = !(collection.find(query).limit(1).count() == nCount); } //TOTEST (shouldn't ever occur) } else { // Doc has same modified time so don't update ret = false; } //TESTED } return ret; }
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
private String isFunctionalDuplicate(SourcePojo source, LinkedList<String> candidateSourceKeys) {
    // (Ensure everything's set up)
    if (null == _sameConfigurationSources) {
        _sameConfigurationSources = new TreeSet<String>();
        _differentConfigurationSources = new TreeSet<String>();
        _sameCommunitySources = new TreeSet<String>();
    }
    if (null == source.getShah256Hash()) {
        source.generateShah256Hash();
    }
    // See if we've cached something:
    String returnVal = null;
    Iterator<String> it = candidateSourceKeys.iterator();
    while (it.hasNext()) {
        String sourceKey = it.next();
        if (!source.getDuplicateExistingUrls()) {
            // Check _sameCommunitySources: ignore+carry on if sourceKey isn't in here, else
            // return sourceKey, which will treat as a non-update duplicate (non update because
            // the update params only set if it was an update duplicate)
            if (_sameCommunitySources.contains(sourceKey)) {
                return source.getKey(); // (ie return fake source key that will cause above logic to occur)
            }
        } //TESTED
        if (sourceKey.equalsIgnoreCase(source.getKey())) {
            return sourceKey; // (the calling function will then treat it as a duplicate)
        } else if (_sameConfigurationSources.contains(sourceKey)) {
            returnVal = sourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
        } else if (_differentConfigurationSources.contains(sourceKey)) {
            it.remove(); // (don't need to check this source out)
        }
    } //TESTED

    boolean bMatchedInCommunity = false; // (duplication logic below)

    if ((null == returnVal) && !candidateSourceKeys.isEmpty()) {
        // Need to query the DB for this source...
        BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_, source.getShah256Hash());
        query.put(SourcePojo.key_, new BasicDBObject(MongoDbManager.in_, candidateSourceKeys.toArray()));
        BasicDBObject fields = new BasicDBObject(SourcePojo._id_, 0);
        fields.put(SourcePojo.key_, 1);
        if (!source.getDuplicateExistingUrls()) {
            fields.put(SourcePojo.communityIds_, 1);
        }
        DBCursor dbc = DbManager.getIngest().getSource().find(query, fields);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sSourceKey = dbo.getString(SourcePojo.key_);

            // DON'T DEDUP LOGIC:
            if (!source.getDuplicateExistingUrls()) {
                BasicDBList communities = (BasicDBList) dbo.get(SourcePojo.communityIds_);
                for (Object communityIdObj : communities) {
                    ObjectId communityId = (ObjectId) communityIdObj;
                    if (source.getCommunityIds().contains(communityId)) { // Not allowed to duplicate off this
                        _sameCommunitySources.add(sSourceKey);
                        bMatchedInCommunity = true;
                    }
                }
            } //(end "don't duplicate existing URLs logic")
            //TESTED (same community and different communities)

            if (null != sSourceKey) {
                _sameConfigurationSources.add(sSourceKey);
                returnVal = sSourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
            }
        }
        // Loop over config sources again to work out which keys can now be placed in the "_differentConfigurationSources" cache
        for (String sourceKey : candidateSourceKeys) {
            if (!_sameConfigurationSources.contains(sourceKey)) {
                _differentConfigurationSources.add(sourceKey);
            }
        }
    } //TESTED

    if (bMatchedInCommunity) {
        return source.getKey(); // (ie return fake source key that will cause above logic to occur)
    } else {
        return returnVal;
    }
}
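One get() detail worth calling out from the example above: an array field comes back as a BasicDBList (a java.util.List of Objects), so each element still has to be cast individually. A small sketch with a placeholder "communityIds" field name:

import java.util.ArrayList;
import java.util.List;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import org.bson.types.ObjectId;

public class CommunityListSketch {
    static List<ObjectId> communityIds(BasicDBObject sourceDbo) {
        List<ObjectId> ids = new ArrayList<ObjectId>();
        BasicDBList communities = (BasicDBList) sourceDbo.get("communityIds"); // array field -> BasicDBList
        if (null != communities) {
            for (Object communityIdObj : communities) {
                ids.add((ObjectId) communityIdObj); // elements are untyped, cast per element
            }
        }
        return ids;
    }

    public static void main(String[] args) {
        BasicDBList list = new BasicDBList();
        list.add(new ObjectId());
        System.out.println(communityIds(new BasicDBObject("communityIds", list)));
    }
}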
From source file:com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java
License:Open Source License
@Override public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) { if (_isDirectory) { if (_isShare) { // must be a zip file ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>(); @SuppressWarnings("unchecked") Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries(); while (entries.hasMoreElements()) { net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement(); InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName()); zipFiles.add(newFile);//from w ww . j a v a2 s. co m } return zipFiles.toArray(new InfiniteFile[zipFiles.size()]); } //TESTED (3.2) else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory" String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_); String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_); if (null == outputDatabase) { outputDatabase = "custommr"; } DBCollection outColl = null; DBCursor dbc = null; if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory DBCollection chunks = MongoDbManager.getCollection("config", "chunks"); StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection); dbc = chunks.find(new BasicDBObject("ns", ns.toString())); int splits = dbc.count(); if (splits < 2) { // Nothing to do (unsharded or 1 chunk) dbc.close(); outColl = MongoDbManager.getCollection(outputDatabase, outputCollection); dbc = outColl.find(); } //TESTED (4.2) else { // Create one virtual dir per split InfiniteFile[] virtualDirs = new InfiniteFile[splits]; int added = 0; for (DBObject splitObj : dbc) { BasicDBObject minObj = (BasicDBObject) splitObj.get("min"); BasicDBObject maxObj = (BasicDBObject) splitObj.get("max"); ObjectId minId = null; try { minId = (ObjectId) minObj.get("_id"); } catch (Exception e) { } // min key.. ObjectId maxId = null; try { maxId = (ObjectId) maxObj.get("_id"); } catch (Exception e) { } // max key.. 
//Handle current case where custom jobs are all dumped in with the wrong _id type if ((null != minId) || (null != maxId)) { if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below) if (maxId.getTime() < optionalFilterDate.getTime()) { // (the "getTime()"s can overlap across chunks so we have to use minId // and accept that we'll often deserialize 1+ extra chunk every harvest) continue; } } //TESTED (by hand) InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId); virtualDirs[added] = split; added++; } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand) } dbc.close(); return virtualDirs; } //TESTED (5.2.2, 6.2.2) } //TESTED else { // Virtual directory BasicDBObject query = new BasicDBObject(); if (null != _virtualDirStartLimit) { if (null != optionalFilterDate) { ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0); //(zero out the inc/machine ids so this query is independent to calling service) if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit) query.put(MongoDbManager.gte_, altStartId); } else { query.put(MongoDbManager.gte_, _virtualDirStartLimit); } } //TESTED (by hand) else { // normal case query.put(MongoDbManager.gte_, _virtualDirStartLimit); } } else if (null != optionalFilterDate) { // (first chunk so always overwrite with optionalFilter date if applicable) ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0); query.put(MongoDbManager.gte_, altStartId); } //TESTED (by hand) if (null != _virtualDirEndLimit) { query.put(MongoDbManager.lt_, _virtualDirEndLimit); } outColl = MongoDbManager.getCollection(outputDatabase, outputCollection); dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle); } //TESTED (6.2.2) (doc skipping by hand) if (null != outColl) { // has files, create the actual file objects //DEBUG //System.out.println("CHUNK: GOT " + dbc.count()); int docCount = dbc.count(); if (docCount > 1 + maxDocsPerCycle) { docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway) } InfiniteFile[] docs = new InfiniteFile[docCount]; int added = 0; for (DBObject docObj : dbc) { // (if didn't use a query then apply internal filter date by hand) if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit) && (null != optionalFilterDate)) { ObjectId docId = (ObjectId) docObj.get("_id"); if (optionalFilterDate.getTime() > docId.getTime()) { continue; } } //TESTED if (added >= maxDocsPerCycle) { // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to) docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"), _virtualDirEndLimit); break; } else { InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj); docs[added] = doc; } //TESTED (both cases) added++; } dbc.close(); return docs; } //TESTED (4.2) } } else { // can just return myself InfiniteFile[] retVal = new InfiniteFile[1]; retVal[0] = this; return retVal; } //TESTED (1.2, 2.2) return null; }
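A pattern buried in the example above that is easy to miss: get("_id") is cast to ObjectId and its embedded creation timestamp is used as a date filter. A minimal sketch of that idea (getTime() on ObjectId is what the older mongo-java-driver used here exposes; newer drivers use getDate()/getTimestamp() instead):

import java.util.Date;
import com.mongodb.BasicDBObject;
import org.bson.types.ObjectId;

public class IdTimestampFilterSketch {
    // true == the document's _id was generated after the given cut-off date
    static boolean createdAfter(BasicDBObject doc, Date filterDate) {
        ObjectId docId = (ObjectId) doc.get("_id");
        if (null == docId) {
            return false;
        }
        // older driver versions expose the _id creation time as getTime() in millis
        return docId.getTime() > filterDate.getTime();
    }

    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject("_id", new ObjectId());
        System.out.println(createdAfter(doc, new Date(0L))); // true - just created
        System.out.println(createdAfter(doc, new Date()));   // almost certainly false
    }
}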
From source file:com.ikanow.infinit.e.harvest.HarvestController.java
License:Open Source License
private static String getDocumentContentFromWhichToDuplicate(DocumentPojo docToReplace) {
    try {
        // Get the full text:
        byte[] storageArray = new byte[200000];
        BasicDBObject contentQ = new BasicDBObject("url", docToReplace.getUrl());
        contentQ.put(CompressedFullTextPojo.sourceKey_,
                new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, docToReplace.getSourceKey())));
        BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
        BasicDBObject dboContent = (BasicDBObject) DbManager.getDocument().getContent().findOne(contentQ, fields);
        if (null != dboContent) {
            byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
            ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
            GZIPInputStream gzip = new GZIPInputStream(in);
            int nRead = 0;
            StringBuffer output = new StringBuffer();
            while (nRead >= 0) {
                nRead = gzip.read(storageArray, 0, 200000);
                if (nRead > 0) {
                    String s = new String(storageArray, 0, nRead, "UTF-8");
                    output.append(s);
                }
            }
            return output.toString();
        } else { // Will just need to-reprocess this document
            return null;
        }
    } catch (Exception e) { // Do nothing, just carry on
        e.printStackTrace();
    }
    return null;
}
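The only get() call in the example above retrieves the gzipped full text as a raw byte[]; everything else is stream plumbing. A compact, self-contained version of that retrieve-and-decompress step (the "gzip_content" field name stands in for CompressedFullTextPojo.gzip_content_, and the round-trip in main is just to make the sketch runnable without a database):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import com.mongodb.BasicDBObject;

public class GzipContentSketch {
    static String decompressContent(BasicDBObject dboContent) throws Exception {
        byte[] compressedData = (byte[]) dboContent.get("gzip_content"); // binary field -> byte[]
        GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressedData));
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buf = new byte[8192];
        int nRead;
        while ((nRead = gzip.read(buf)) > 0) {
            out.write(buf, 0, nRead);
        }
        return out.toString("UTF-8");
    }

    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        GZIPOutputStream gz = new GZIPOutputStream(compressed);
        gz.write("hello full text".getBytes("UTF-8"));
        gz.close();
        BasicDBObject dbo = new BasicDBObject("gzip_content", compressed.toByteArray());
        System.out.println(decompressContent(dbo));
    }
}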
From source file:com.ikanow.infinit.e.harvest.HarvestControllerPipeline.java
License:Open Source License
private void splitDocuments(DocumentPojo doc, SourcePojo source, SourcePipelinePojo splitter, List<DocumentPojo> docs) { try {/* ww w.j a v a 2 s . c o m*/ if (null == source.getRssConfig()) { source.setRssConfig(new SourceRssConfigPojo()); } if (null != source.getRssConfig().getExtraUrls()) { // refreshed ready for new document source.getRssConfig().setExtraUrls(null); } HashMap<String, Object> jsonLookup = new HashMap<String, Object>(); if ((null != splitter.splitter.getScriptlang()) && splitter.splitter.getScriptlang().startsWith("automatic")) { // (automatic or automatic_json or automatic_xml) String[] args = splitter.splitter.getScript().split("\\s*,\\s*"); Object[] objList = null; String field = args[0]; if (field.startsWith(DocumentPojo.fullText_)) { // fullText, or fullText.[x] where [x] is the root value DocumentPojo dummyDoc = new DocumentPojo(); dummyDoc.setFullText(doc.getFullText()); MetadataSpecPojo dummyContent = new MetadataSpecPojo(); dummyContent.fieldName = "extract"; dummyContent.scriptlang = "stream"; dummyContent.flags = "o"; if (field.equals(DocumentPojo.fullText_)) { // fullText dummyContent.script = ""; } else { dummyContent.script = field.substring(1 + DocumentPojo.fullText_.length()); //+1 for the "." } _uah.processMetadataChain(dummyDoc, Arrays.asList(dummyContent), source.getRssConfig(), null); BasicDBObject dummyDocDbo = (BasicDBObject) dummyDoc.toDb(); dummyDocDbo = (BasicDBObject) dummyDocDbo.get(DocumentPojo.metadata_); if (null != dummyDocDbo) { objList = ((Collection<?>) (dummyDocDbo.get("extract"))).toArray(); // (returns a list of strings) } } //TESTED (doc_splitter_test_auto_json, json: test3, xml: test4) else if (field.startsWith(DocumentPojo.metadata_)) { // field starts with "metadata." objList = doc.getMetadata().get(field.substring(1 + DocumentPojo.metadata_.length())); //+1 for the "." } //TESTED (doc_splitter_test_auto_json, test1) else { // direct reference to metadata field objList = doc.getMetadata().get(field); } //TESTED (doc_splitter_test_auto_json, test2) if ((null != objList) && (objList.length > 0)) { source.getRssConfig().setExtraUrls(new ArrayList<ExtraUrlPojo>(objList.length)); int num = 0; for (Object o : objList) { num++; ExtraUrlPojo url = new ExtraUrlPojo(); if ((1 == args.length) || !(o instanceof DBObject)) { // generate default URL url.url = doc.getUrl() + "#" + num; } //TESTED (doc_splitter_test_auto_json, test1) else if (2 == args.length) { // url specified in the format <fieldname-in-dot-notation> url.url = MongoDbUtil.getProperty((DBObject) o, args[1]); } //TESTED (doc_splitter_test_auto_json, test2) else { // url specified in format <message-format-with-{1}-{2}-etc>,<fieldname-in-dot-notation-for-1>,.. 
ArrayList<Object> cmdArgs = new ArrayList<Object>(args.length - 1); //-2 + 1 (+1 - see below) cmdArgs.add("[INDEX_FROM_1_NOT_0]"); for (int j = 2; j < args.length; ++j) { cmdArgs.add(MongoDbUtil.getProperty((DBObject) o, args[j])); } url.url = MessageFormat.format(args[1], cmdArgs.toArray()); } //TESTED (doc_splitter_test_auto_json, test3, test4) if (null == url.url) { // (if we can't extract a URL then bail out) continue; } url.title = new StringBuffer(doc.getTitle()).append(" (").append(num).append(")") .toString(); url.fullText = o.toString(); source.getRssConfig().getExtraUrls().add(url); if (splitter.splitter.getScriptlang().startsWith("automatic_")) { // automatic_json or automatic_xml jsonLookup.put(url.url, o); } } } //TESTED (doc_splitter_test_auto_json) } else { // normal case - run the 'follow web links' code to get the docs source.getRssConfig().setSearchConfig(splitter.splitter); FeedHarvester_searchEngineSubsystem subsys = new FeedHarvester_searchEngineSubsystem(); subsys.generateFeedFromSearch(source, _hc, doc); } if (null != source.getRssConfig().getExtraUrls()) { for (ExtraUrlPojo newDocInfo : source.getRssConfig().getExtraUrls()) { if (null == doc.getSourceUrl()) { // (if sourceUrl != null, bypass it's because it's been generated by a file so is being deleted anyway) //(note: this null check above is relied upon by the federated query engine, so don't go randomly changing it!) if (_hc.getDuplicateManager().isDuplicate_Url(newDocInfo.url, source, null)) { //TODO: should handle updateCycle_secs? continue; } } DocumentPojo newDoc = new DocumentPojo(); newDoc.setCreated(doc.getCreated()); newDoc.setModified(doc.getModified()); newDoc.setUrl(newDocInfo.url); newDoc.setTitle(newDocInfo.title); newDoc.setDescription(newDocInfo.description); newDoc.setFullText(newDocInfo.fullText); // For JSON, also create the metadata) if (null != splitter.splitter.getScriptlang()) { if (splitter.splitter.getScriptlang().equals("automatic_json")) { newDoc.addToMetadata("json", jsonLookup.get(newDoc.getUrl())); } else if (splitter.splitter.getScriptlang().equals("automatic_xml")) { Object obj = jsonLookup.get(newDoc.getUrl()); if (obj instanceof DBObject) { DBObject dbo = (DBObject) obj; for (String key : dbo.keySet()) { Object objArray = dbo.get(key); if (objArray instanceof Object[]) { newDoc.addToMetadata(key, (Object[]) objArray); } else if (objArray instanceof Collection<?>) { newDoc.addToMetadata(key, ((Collection<?>) objArray).toArray()); } } } //(test4) } } //TESTED (doc_splitter_test_auto_json, test1:json, test4:xml) // Published date is a bit more complex if (null != newDocInfo.publishedDate) { try { newDoc.setPublishedDate(new Date(DateUtility.parseDate(newDocInfo.publishedDate))); } catch (Exception e) { } } //TESTED (test3,test4) if (null == newDoc.getPublishedDate()) { newDoc.setPublishedDate(doc.getPublishedDate()); } //TESTED (test1) if (null == newDoc.getPublishedDate()) { newDoc.setPublishedDate(doc.getCreated()); } //TESTED (test2) newDoc.setTempSource(source); newDoc.setSource(doc.getSource()); newDoc.setMediaType(doc.getMediaType()); newDoc.setSourceKey(doc.getSourceKey()); newDoc.setSourceUrl(doc.getSourceUrl()); // (otherwise won't be able to delete child docs that come from a file) newDoc.setCommunityId(doc.getCommunityId()); newDoc.setDocGeo(doc.getDocGeo()); newDoc.setIndex(doc.getIndex()); newDoc.setSpawnedFrom(splitter); docs.add(newDoc); } //end loop over URLs } //TESTED } catch (Exception e) { StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e); 
_hc.getHarvestStatus().logMessage(errMessage.toString(), true); } //TESTED (test4) }
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation) throws IOException, SAXException, ParserConfigurationException { StringWriter xml = new StringWriter(); String outputCollection = job.outputCollectionTemp;// (non-append mode) if ((null != job.appendResults) && job.appendResults) outputCollection = job.outputCollection; // (append mode, write directly in....) else if (null != job.incrementalMode) job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode) createConfigXML(xml, job.jobtitle, job.inputCollection, InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper, job.reducer, job.combiner,//from w w w .j a v a2 s. c o m InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode, job.submitterID, job.selfMerge, job.outputCollection, job.appendResults); ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, savedClassLoader); Thread.currentThread().setContextClassLoader(child); // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable boolean dataModelLoaded = true; try { URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, null); try { Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest); } catch (ClassNotFoundException e2) { //(this is fine, will use the cached version) dataModelLoaded = false; } if (dataModelLoaded) Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest); } catch (ClassNotFoundException e1) { throw new RuntimeException( "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards"); } // Now load the XML into a configuration object: Configuration config = new Configuration(); // Add the client configuration overrides: if (!bLocalMode) { String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/"; config.addResource(new Path(hadoopConfigPath + "core-site.xml")); config.addResource(new Path(hadoopConfigPath + "mapred-site.xml")); config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml")); } //TESTED try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes())); NodeList nList = doc.getElementsByTagName("property"); for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String name = getTagValue("name", eElement); String value = getTagValue("value", eElement); if ((null != name) && (null != value)) { config.set(name, value); } } } } catch (Exception e) { throw new IOException(e.getMessage()); } // Some other config defaults: // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config) config.set("mapred.map.tasks.speculative.execution", "false"); config.set("mapred.reduce.tasks.speculative.execution", "false"); // 
(default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera) // Now run the JAR file try { BasicDBObject advancedConfigurationDbo = null; try { advancedConfigurationDbo = (null != job.query) ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query)) : (new BasicDBObject()); } catch (Exception e) { advancedConfigurationDbo = new BasicDBObject(); } boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable; if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) { throw new RuntimeException( "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead."); } config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing) if (bLocalMode) { // local job tracker and FS mode config.set("mapred.job.tracker", "local"); config.set("fs.default.name", "local"); } else { if (bTestMode) { // run job tracker locally but FS mode remotely config.set("mapred.job.tracker", "local"); } else { // normal job tracker String trackerUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker"); config.set("mapred.job.tracker", trackerUrl); } String fsUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name"); config.set("fs.default.name", fsUrl); } if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.data_model.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.processing.custom.library.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } //TESTED // Debug scripts (only if they exist), and only in non local/test mode if (!bLocalMode && !bTestMode) { try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_map_error_handler.sh", config); config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle); config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_reduce_error_handler.sh", config); config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on } //TODO (???): TOTEST // (need to do these 2 things here before the job is created, at which point the config class has been copied across) //1) Class<?> mapperClazz = Class.forName(job.mapper, true, child); if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz .newInstance(); preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode)); } //TESTED //2) if 
(job.inputCollection.equalsIgnoreCase("file.binary_shares")) { // Need to download the GridFSZip file try { Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/", "GridFSZipFile.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } catch (Throwable t) { } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!) } if (job.inputCollection.equals("records")) { InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo); //(won't run under 0.19 so running with "records" should cause all sorts of exceptions) } //TESTED (by hand) if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec"); } // Manually specified caches List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"), job, config, props_custom); Job hj = new Job(config); // (NOTE: from here, changes to config are ignored) try { if (null != localJarCaches) { if (bLocalMode || bTestMode) { Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class }); method.setAccessible(true); method.invoke(child, localJarCaches.toArray()); } //TOTEST (tested logically) } Class<?> classToLoad = Class.forName(job.mapper, true, child); hj.setJarByClass(classToLoad); if (job.inputCollection.equalsIgnoreCase("filesystem")) { String inputPath = null; try { inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); if (!inputPath.endsWith("/")) { inputPath = inputPath + "/"; } } catch (Exception e) { } if (null == inputPath) { throw new RuntimeException("Must specify 'file.url' if reading from filesystem."); } inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath); InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive) InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB) InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child)); } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) { String[] oidStrs = null; try { String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)"); Matcher m = oidExtractor.matcher(inputPath); if (m.find()) { oidStrs = m.group(1).split("\\s*,\\s*"); } else { throw new RuntimeException( "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath); } InfiniteHadoopUtils.authenticateShareList(job, oidStrs); } catch (Exception e) { throw new RuntimeException( "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e); } hj.getConfiguration().setStrings("mapred.input.dir", oidStrs); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child)); } else if (job.inputCollection.equals("records")) { hj.setInputFormatClass((Class<? extends InputFormat>) Class .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child)); } else { if (esMode) { hj.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName( "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat", true, child)); } else { hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child)); } } if ((null != job.exportToHdfs) && job.exportToHdfs) { //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?) Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom); if ((null != job.outputKey) && (null != job.outputValue) && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text") && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) { // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text) hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child)); TextOutputFormat.setOutputPath(hj, outPath); } //TESTED else { hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child)); SequenceFileOutputFormat.setOutputPath(hj, outPath); } //TESTED } else { // normal case, stays in MongoDB hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child)); } hj.setMapperClass((Class<? extends Mapper>) mapperClazz); String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null); if (null != mapperOutputKeyOverride) { hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride)); } //TESTED String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null); if (null != mapperOutputValueOverride) { hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride)); } //TESTED if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null") && !job.reducer.equalsIgnoreCase("none")) { hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child)); // Variable reducers: if (null != job.query) { try { hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1)); } catch (Exception e) { try { // (just check it's not a string that is a valid int) hj.setNumReduceTasks( Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1"))); } catch (Exception e2) { } } } //TESTED } else { hj.setNumReduceTasks(0); } if ((null != job.combiner) && !job.combiner.startsWith("#") && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) { hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child)); } hj.setOutputKeyClass(Class.forName(job.outputKey, true, child)); hj.setOutputValueClass(Class.forName(job.outputValue, true, child)); hj.setJobName(job.jobtitle); currJobName = job.jobtitle; } catch (Error e) { // (messing about with class loaders = lots of chances for errors!) 
throw new RuntimeException(e.getMessage(), e); } if (bTestMode || bLocalMode) { hj.submit(); currThreadId = null; Logger.getRootLogger().addAppender(this); currLocalJobId = hj.getJobID().toString(); currLocalJobErrs.setLength(0); while (!hj.isComplete()) { Thread.sleep(1000); } Logger.getRootLogger().removeAppender(this); if (hj.isSuccessful()) { if (this.currLocalJobErrs.length() > 0) { return "local_done: " + this.currLocalJobErrs.toString(); } else { return "local_done"; } } else { return "Error: " + this.currLocalJobErrs.toString(); } } else { hj.submit(); String jobId = hj.getJobID().toString(); return jobId; } } catch (Exception e) { e.printStackTrace(); Thread.currentThread().setContextClassLoader(savedClassLoader); return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } }