Example usage for com.mongodb BasicDBObject get

List of usage examples for com.mongodb BasicDBObject get

Introduction

On this page you can find usage examples for com.mongodb BasicDBObject.get.

Prototype

public Object get(final String key) 

Document

Gets a value from this object.
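
As the usage examples below illustrate, get returns Object, so callers typically cast the result to the expected type and check for null when a key may be absent. Here is a minimal, self-contained sketch of that pattern (the document and field names are illustrative only, not taken from the examples below):

import com.mongodb.BasicDBObject;

public class BasicDBObjectGetExample {
    public static void main(String[] args) {
        // Build a document with a nested sub-document
        BasicDBObject address = new BasicDBObject("city", "Springfield").append("zip", "12345");
        BasicDBObject doc = new BasicDBObject("name", "example").append("address", address);

        // get() returns Object, so cast to the expected type (String, BasicDBObject, ...)
        String name = (String) doc.get("name");
        BasicDBObject nested = (BasicDBObject) doc.get("address");

        // get() returns null for a missing key, hence the frequent null checks in the examples
        Object missing = doc.get("doesNotExist");

        System.out.println(name);               // example
        System.out.println(nested.get("city")); // Springfield
        System.out.println(missing);            // null
    }
}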

Usage

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

@SuppressWarnings("unchecked")
public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return false;
        }
    } catch (Exception e) {
        //DEBUG
        //e.printStackTrace();

        return false; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    keyFields.put(SourcePojo.highestDistributionFactorStored_, 1);

    // Get and remove the sourceKey information, incorporate into source query,
    // so it's nice and simple by the time it gets to the actual query
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);

    if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless 
        if (null != sourceKeyQueryTerm) {
            keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
        }
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED (including $all to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db", "tagtest2" ])
    else if (null != sourceKeyQueryTerm) {
        boolean sourceKeyQueryComplex = false;

        if (sourceKeyQueryTerm instanceof BasicDBObject) {
            BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm;
            if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be lt/lte/gt/gte
                for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) {
                    if (!sourceKeyQueryTermEl.equals(DbManager.in_)
                            && !sourceKeyQueryTermEl.equals(DbManager.lt_)
                            && !sourceKeyQueryTermEl.equals(DbManager.lte_)
                            && !sourceKeyQueryTermEl.equals(DbManager.gt_)
                            && !sourceKeyQueryTermEl.equals(DbManager.gte_)) {
                        sourceKeyQueryComplex = true;
                        break;
                    } //TESTED (eg ne)
                    else if (sourceKeyQueryTermEl.equals(DbManager.in_)
                            && (1 != sourceKeyQueryTermDbo.size())) {
                        sourceKeyQueryComplex = true;
                        break;
                    } //TESTED ((lt,in))
                }
            } //TESTED: (in, (gte,lt), ne)
            else {
                sourceKeyQueryComplex = true;
            } //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte":"test" } })
        } else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a regex
            sourceKeyQueryComplex = true;
        }
        //TESTED ($regex)

        if (sourceKeyQueryComplex) {
            keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below
        } else {
            return false; // already have a perfectly good source key specification
        }
    } //TESTED (See combinations above)

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
    int count = dbc.count();

    if (count > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return false;
    } else {
        ArrayList<String> sources = new ArrayList<String>(count);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
            sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor));
        }
        if (sources.isEmpty()) {
            throw new RuntimeException(); // will just return no splits at all, no problem
        } //TESTED
        if (1 == sources.size()) {
            query.put(DocumentPojo.sourceKey_, sources.get(0));
        } //TESTED
        else {
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
        } //TESTED

        return true;
    }
}

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources 
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;

    } // (end if too many source keys across the communities)
}

From source file: com.ikanow.infinit.e.data_model.utils.JsonPrettyPrinter.java

License: Apache License

public static void jsonObjectToTextFormatted(BasicDBObject jsonObject, int indent, StringBuffer jsonText) {
    jsonText.append("{ ");
    appendNewLine(indent + 2, jsonText);
    Set<String> keys = new TreeSet<String>(jsonObject.keySet());
    boolean isFirst = true;
    for (String key : keys) {
        Object val = jsonObject.get(key);
        if (null == val)
            continue;
        if (isFirst) {
            isFirst = false;
        } else {
            jsonText.append(", ");
            appendNewLine(indent + 2, jsonText);
        }
        jsonText.append("\"");
        jsonText.append(key);
        jsonText.append("\" : ");
        jsonObjectToTextFormatted(val, indent + 2, jsonText);
    }
    appendNewLine(indent, jsonText);
    jsonText.append("}");
}

From source file: com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License: Open Source License

/**
 * Tests to see if duplicates might exist.
 * If it is not a duplicate, true is returned. If it is a duplicate,
 * the modified date is then checked to see if the file has been updated.
 * True is returned if the file has been updated, false otherwise.
 *
 * @param modifiedDate
 * @param sourceUrl
 * @param source
 * @return boolean (true/false)
 */
public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                    if (null != source.getDistributionFactor()) { // This is a slightly more complex case because other...
                        //...threads for this source could be writing documents asynchronously ... so we're just going to disable everything
                        //if the most recent doc is _after_ our last harvest time (since this means we've already started harvesting the new source)
                        Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo
                                .get(DocumentPojo.created_);
                        if ((null != source.getHarvestStatus())
                                && (null != source.getHarvestStatus().getHarvested()
                                        && (null != mostRecentlyModifedFile_createdTime))) {
                            if (mostRecentlyModifedFile_createdTime
                                    .after(source.getHarvestStatus().getHarvested())) {
                                _mostRecentlyModifiedFile = null;
                                _mostRecentlyModifiedDocId = null;
                            }
                        } else { // If we don't have a date then force a "slow" dedup
                            _mostRecentlyModifiedFile = null;
                            _mostRecentlyModifiedDocId = null;
                        }
                    } //TESTED
                } //(found docs)
            } //(success mode)
        } catch (Exception e) {
        } // If anything goes wrong will just check all files (slower)         
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    else if (null == sourceUrl) {
        return true; // (for custom checking - if we couldn't get a cached value to compare against then assume we are inspecting)
    }

    // No short cut, go the long way round:      

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.sourceUrl_, sourceUrl);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2);
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).hint(hint).limit(1);
    // (this should be very fast since sourceUrl is indexed ... order doesn't matter as all docs should have the same modified)
    //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky

    if (!dbc.hasNext()) { //if there is no record, return true
        ret = true;
        modifiedDate.setTime(0);
    } else { // (all docs should have same modified, though this is ~ time ordered anyway)

        BasicDBObject dbo = (BasicDBObject) dbc.iterator().next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)); // ie if different -> true -> update docs from sourceUrl
        // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where)
        // (this is just backwards compatible for a deployment where this has happened for some % -probably 100- of the docs
        //  once an RPM >=5955 is deployed this will no longer be necessary)
    }
    return ret;
}

From source file: com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License: Open Source License

public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_,
                                new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED

                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                } //TESTED (found docs)

                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //   System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));

            } //(success mode)            
        } catch (Exception e) {
        } // If anything goes wrong will just check all files (slower)

    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...         
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED

    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No short cut, go the long way round:

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();

    if (!foundMatch) { //if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)               
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)         
        } else { // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}

From source file: com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License: Open Source License

private String isFunctionalDuplicate(SourcePojo source, LinkedList<String> candidateSourceKeys) {
    // (Ensure everything's set up)
    if (null == _sameConfigurationSources) {
        _sameConfigurationSources = new TreeSet<String>();
        _differentConfigurationSources = new TreeSet<String>();
        _sameCommunitySources = new TreeSet<String>();
    }
    if (null == source.getShah256Hash()) {
        source.generateShah256Hash();
    }

    // See if we've cached something:
    String returnVal = null;
    Iterator<String> it = candidateSourceKeys.iterator();
    while (it.hasNext()) {
        String sourceKey = it.next();

        if (!source.getDuplicateExistingUrls()) {
            // Check _sameCommunitySources: ignore+carry on if sourceKey isn't in here, else 
            // return sourceKey, which will treat as a non-update duplicate (non update because 
            // the update params only set if it was an update duplicate)
            if (_sameCommunitySources.contains(sourceKey)) {
                return source.getKey(); // (ie return fake source key that will cause above logic to occur)
            }
        } //TESTED

        if (sourceKey.equalsIgnoreCase(source.getKey())) {
            return sourceKey; // (the calling function will then treat it as a duplicate)
        } else if (_sameConfigurationSources.contains(sourceKey)) {
            returnVal = sourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
        } else if (_differentConfigurationSources.contains(sourceKey)) {
            it.remove(); // (don't need to check this source out)
        }
    } //TESTED
    boolean bMatchedInCommunity = false; // (duplication logic below)
    if ((null == returnVal) && !candidateSourceKeys.isEmpty()) {

        // Need to query the DB for this source...         
        BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_, source.getShah256Hash());
        query.put(SourcePojo.key_, new BasicDBObject(MongoDbManager.in_, candidateSourceKeys.toArray()));
        BasicDBObject fields = new BasicDBObject(SourcePojo._id_, 0);
        fields.put(SourcePojo.key_, 1);
        if (!source.getDuplicateExistingUrls()) {
            fields.put(SourcePojo.communityIds_, 1);
        }
        DBCursor dbc = DbManager.getIngest().getSource().find(query, fields);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sSourceKey = dbo.getString(SourcePojo.key_);

            // DON'T DEDUP LOGIC:
            if (!source.getDuplicateExistingUrls()) {
                BasicDBList communities = (BasicDBList) dbo.get(SourcePojo.communityIds_);
                for (Object communityIdObj : communities) {
                    ObjectId communityId = (ObjectId) communityIdObj;
                    if (source.getCommunityIds().contains(communityId)) { // Not allowed to duplicate off this
                        _sameCommunitySources.add(sSourceKey);
                        bMatchedInCommunity = true;
                    }
                }
            } //(end "don't duplicate existing URLs logic")
              //TESTED (same community and different communities)

            if (null != sSourceKey) {
                _sameConfigurationSources.add(sSourceKey);
                returnVal = sSourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
            }
        }
        // Loop over config sources again to work out which keys can now be placed in the "_differentConfigurationSources" cache
        for (String sourceKey : candidateSourceKeys) {
            if (!_sameConfigurationSources.contains(sourceKey)) {
                _differentConfigurationSources.add(sourceKey);
            }
        }
    } //TESTED
    if (bMatchedInCommunity) {
        return source.getKey(); // (ie return fake source key that will cause above logic to occur)
    } else {
        return returnVal;
    }

}

From source file: com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java

License: Open Source License

@Override
public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) {
    if (_isDirectory) {
        if (_isShare) { // must be a zip file
            ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>();
            @SuppressWarnings("unchecked")
            Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries();
            while (entries.hasMoreElements()) {
                net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName());
                zipFiles.add(newFile);
            }
            return zipFiles.toArray(new InfiniteFile[zipFiles.size()]);
        } //TESTED (3.2)
        else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory"
            String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_);
            String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_);
            if (null == outputDatabase) {
                outputDatabase = "custommr";
            }
            DBCollection outColl = null;
            DBCursor dbc = null;
            if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory

                DBCollection chunks = MongoDbManager.getCollection("config", "chunks");
                StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection);
                dbc = chunks.find(new BasicDBObject("ns", ns.toString()));
                int splits = dbc.count();

                if (splits < 2) { // Nothing to do (unsharded or 1 chunk)
                    dbc.close();

                    outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                    dbc = outColl.find();
                } //TESTED (4.2)
                else { // Create one virtual dir per split
                    InfiniteFile[] virtualDirs = new InfiniteFile[splits];
                    int added = 0;
                    for (DBObject splitObj : dbc) {
                        BasicDBObject minObj = (BasicDBObject) splitObj.get("min");
                        BasicDBObject maxObj = (BasicDBObject) splitObj.get("max");
                        ObjectId minId = null;
                        try {
                            minId = (ObjectId) minObj.get("_id");
                        } catch (Exception e) {
                        } // min key..
                        ObjectId maxId = null;
                        try {
                            maxId = (ObjectId) maxObj.get("_id");
                        } catch (Exception e) {
                        } // max key..

                        //Handle current case where custom jobs are all dumped in with the wrong _id type                     
                        if ((null != minId) || (null != maxId)) {
                            if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below)

                                if (maxId.getTime() < optionalFilterDate.getTime()) {
                                    // (the "getTime()"s can overlap across chunks so we have to use minId
                                    //  and accept that we'll often deserialize 1+ extra chunk every harvest)
                                    continue;
                                }
                            } //TESTED (by hand)

                            InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId);
                            virtualDirs[added] = split;
                            added++;
                        } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand)
                    }
                    dbc.close();
                    return virtualDirs;
                } //TESTED (5.2.2, 6.2.2)
            } //TESTED
            else { // Virtual directory
                BasicDBObject query = new BasicDBObject();
                if (null != _virtualDirStartLimit) {
                    if (null != optionalFilterDate) {
                        ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                        //(zero out the inc/machine ids so this query is independent to calling service)

                        if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit)
                            query.put(MongoDbManager.gte_, altStartId);
                        } else {
                            query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                        }
                    } //TESTED (by hand)
                    else { // normal case
                        query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                    }
                } else if (null != optionalFilterDate) { // (first chunk so always overwrite with optionalFilter date if applicable)
                    ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                    query.put(MongoDbManager.gte_, altStartId);
                } //TESTED (by hand)
                if (null != _virtualDirEndLimit) {
                    query.put(MongoDbManager.lt_, _virtualDirEndLimit);
                }

                outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle);
            } //TESTED (6.2.2) (doc skipping by hand)

            if (null != outColl) { // has files, create the actual file objects
                //DEBUG
                //System.out.println("CHUNK: GOT " + dbc.count());

                int docCount = dbc.count();
                if (docCount > 1 + maxDocsPerCycle) {
                    docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway)
                }
                InfiniteFile[] docs = new InfiniteFile[docCount];
                int added = 0;
                for (DBObject docObj : dbc) {
                    // (if didn't use a query then apply internal filter date by hand)
                    if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)
                            && (null != optionalFilterDate)) {
                        ObjectId docId = (ObjectId) docObj.get("_id");
                        if (optionalFilterDate.getTime() > docId.getTime()) {
                            continue;
                        }
                    } //TESTED

                    if (added >= maxDocsPerCycle) { // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to)
                        docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"),
                                _virtualDirEndLimit);
                        break;
                    } else {
                        InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj);
                        docs[added] = doc;
                    } //TESTED (both cases)
                    added++;
                }
                dbc.close();
                return docs;

            } //TESTED (4.2)
        }
    } else { // can just return myself
        InfiniteFile[] retVal = new InfiniteFile[1];
        retVal[0] = this;
        return retVal;
    } //TESTED (1.2, 2.2)
    return null;
}

From source file: com.ikanow.infinit.e.harvest.HarvestController.java

License: Open Source License

private static String getDocumentContentFromWhichToDuplicate(DocumentPojo docToReplace) {
    try {
        // Get the full text:
        byte[] storageArray = new byte[200000];
        BasicDBObject contentQ = new BasicDBObject("url", docToReplace.getUrl());
        contentQ.put(CompressedFullTextPojo.sourceKey_,
                new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, docToReplace.getSourceKey())));
        BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
        BasicDBObject dboContent = (BasicDBObject) DbManager.getDocument().getContent().findOne(contentQ,
                fields);
        if (null != dboContent) {
            byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
            ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
            GZIPInputStream gzip = new GZIPInputStream(in);
            int nRead = 0;
            StringBuffer output = new StringBuffer();
            while (nRead >= 0) {
                nRead = gzip.read(storageArray, 0, 200000);
                if (nRead > 0) {
                    String s = new String(storageArray, 0, nRead, "UTF-8");
                    output.append(s);
                }
            }
            return output.toString();
        } else { // Will just need to re-process this document
            return null;
        }
    } catch (Exception e) {
        // Do nothing, just carry on
        e.printStackTrace();
    }
    return null;
}

From source file: com.ikanow.infinit.e.harvest.HarvestControllerPipeline.java

License: Open Source License

private void splitDocuments(DocumentPojo doc, SourcePojo source, SourcePipelinePojo splitter,
        List<DocumentPojo> docs) {
    try {
        if (null == source.getRssConfig()) {
            source.setRssConfig(new SourceRssConfigPojo());
        }
        if (null != source.getRssConfig().getExtraUrls()) { // refreshed ready for new document
            source.getRssConfig().setExtraUrls(null);
        }

        HashMap<String, Object> jsonLookup = new HashMap<String, Object>();
        if ((null != splitter.splitter.getScriptlang())
                && splitter.splitter.getScriptlang().startsWith("automatic")) {
            // (automatic or automatic_json or automatic_xml)

            String[] args = splitter.splitter.getScript().split("\\s*,\\s*");
            Object[] objList = null;

            String field = args[0];
            if (field.startsWith(DocumentPojo.fullText_)) { // fullText, or fullText.[x] where [x] is the root value

                DocumentPojo dummyDoc = new DocumentPojo();
                dummyDoc.setFullText(doc.getFullText());
                MetadataSpecPojo dummyContent = new MetadataSpecPojo();
                dummyContent.fieldName = "extract";
                dummyContent.scriptlang = "stream";
                dummyContent.flags = "o";

                if (field.equals(DocumentPojo.fullText_)) { // fullText
                    dummyContent.script = "";
                } else {
                    dummyContent.script = field.substring(1 + DocumentPojo.fullText_.length()); //+1 for the "."
                }
                _uah.processMetadataChain(dummyDoc, Arrays.asList(dummyContent), source.getRssConfig(), null);

                BasicDBObject dummyDocDbo = (BasicDBObject) dummyDoc.toDb();
                dummyDocDbo = (BasicDBObject) dummyDocDbo.get(DocumentPojo.metadata_);
                if (null != dummyDocDbo) {
                    objList = ((Collection<?>) (dummyDocDbo.get("extract"))).toArray(); // (returns a list of strings)
                }
            } //TESTED (doc_splitter_test_auto_json, json: test3, xml: test4)
            else if (field.startsWith(DocumentPojo.metadata_)) { // field starts with "metadata."
                objList = doc.getMetadata().get(field.substring(1 + DocumentPojo.metadata_.length())); //+1 for the "."               
            } //TESTED (doc_splitter_test_auto_json, test1)
            else { // direct reference to metadata field
                objList = doc.getMetadata().get(field);
            } //TESTED (doc_splitter_test_auto_json, test2)

            if ((null != objList) && (objList.length > 0)) {
                source.getRssConfig().setExtraUrls(new ArrayList<ExtraUrlPojo>(objList.length));
                int num = 0;
                for (Object o : objList) {
                    num++;
                    ExtraUrlPojo url = new ExtraUrlPojo();
                    if ((1 == args.length) || !(o instanceof DBObject)) { // generate default URL
                        url.url = doc.getUrl() + "#" + num;
                    } //TESTED (doc_splitter_test_auto_json, test1)
                    else if (2 == args.length) { // url specified in the format <fieldname-in-dot-notation>
                        url.url = MongoDbUtil.getProperty((DBObject) o, args[1]);
                    } //TESTED (doc_splitter_test_auto_json, test2)
                    else { // url specified in format <message-format-with-{1}-{2}-etc>,<fieldname-in-dot-notation-for-1>,..
                        ArrayList<Object> cmdArgs = new ArrayList<Object>(args.length - 1); //-2 + 1 (+1 - see below)
                        cmdArgs.add("[INDEX_FROM_1_NOT_0]");
                        for (int j = 2; j < args.length; ++j) {
                            cmdArgs.add(MongoDbUtil.getProperty((DBObject) o, args[j]));
                        }
                        url.url = MessageFormat.format(args[1], cmdArgs.toArray());
                    } //TESTED (doc_splitter_test_auto_json, test3, test4)

                    if (null == url.url) { // (if we can't extract a URL then bail out)
                        continue;
                    }

                    url.title = new StringBuffer(doc.getTitle()).append(" (").append(num).append(")")
                            .toString();
                    url.fullText = o.toString();
                    source.getRssConfig().getExtraUrls().add(url);
                    if (splitter.splitter.getScriptlang().startsWith("automatic_")) { // automatic_json or automatic_xml
                        jsonLookup.put(url.url, o);
                    }
                }
            } //TESTED (doc_splitter_test_auto_json)
        } else { // normal case - run the 'follow web links' code to get the docs
            source.getRssConfig().setSearchConfig(splitter.splitter);

            FeedHarvester_searchEngineSubsystem subsys = new FeedHarvester_searchEngineSubsystem();
            subsys.generateFeedFromSearch(source, _hc, doc);
        }
        if (null != source.getRssConfig().getExtraUrls()) {
            for (ExtraUrlPojo newDocInfo : source.getRssConfig().getExtraUrls()) {
                if (null == doc.getSourceUrl()) { // (if sourceUrl != null, bypass it because it's been generated by a file so is being deleted anyway)
                    //(note: this null check above is relied upon by the federated query engine, so don't go randomly changing it!) 

                    if (_hc.getDuplicateManager().isDuplicate_Url(newDocInfo.url, source, null)) {
                        //TODO: should handle updateCycle_secs?
                        continue;
                    }
                }
                DocumentPojo newDoc = new DocumentPojo();
                newDoc.setCreated(doc.getCreated());
                newDoc.setModified(doc.getModified());
                newDoc.setUrl(newDocInfo.url);
                newDoc.setTitle(newDocInfo.title);
                newDoc.setDescription(newDocInfo.description);
                newDoc.setFullText(newDocInfo.fullText);

                // For JSON, also create the metadata
                if (null != splitter.splitter.getScriptlang()) {
                    if (splitter.splitter.getScriptlang().equals("automatic_json")) {
                        newDoc.addToMetadata("json", jsonLookup.get(newDoc.getUrl()));
                    } else if (splitter.splitter.getScriptlang().equals("automatic_xml")) {
                        Object obj = jsonLookup.get(newDoc.getUrl());
                        if (obj instanceof DBObject) {
                            DBObject dbo = (DBObject) obj;
                            for (String key : dbo.keySet()) {
                                Object objArray = dbo.get(key);
                                if (objArray instanceof Object[]) {
                                    newDoc.addToMetadata(key, (Object[]) objArray);
                                } else if (objArray instanceof Collection<?>) {
                                    newDoc.addToMetadata(key, ((Collection<?>) objArray).toArray());
                                }
                            }
                        } //(test4)
                    }
                } //TESTED (doc_splitter_test_auto_json, test1:json, test4:xml)

                // Published date is a bit more complex
                if (null != newDocInfo.publishedDate) {
                    try {
                        newDoc.setPublishedDate(new Date(DateUtility.parseDate(newDocInfo.publishedDate)));
                    } catch (Exception e) {
                    }
                } //TESTED (test3,test4)
                if (null == newDoc.getPublishedDate()) {
                    newDoc.setPublishedDate(doc.getPublishedDate());
                } //TESTED (test1)
                if (null == newDoc.getPublishedDate()) {
                    newDoc.setPublishedDate(doc.getCreated());
                } //TESTED (test2)
                newDoc.setTempSource(source);
                newDoc.setSource(doc.getSource());
                newDoc.setMediaType(doc.getMediaType());
                newDoc.setSourceKey(doc.getSourceKey());
                newDoc.setSourceUrl(doc.getSourceUrl()); // (otherwise won't be able to delete child docs that come from a file)
                newDoc.setCommunityId(doc.getCommunityId());
                newDoc.setDocGeo(doc.getDocGeo());
                newDoc.setIndex(doc.getIndex());

                newDoc.setSpawnedFrom(splitter);
                docs.add(newDoc);
            } //end loop over URLs
        } //TESTED
    } catch (Exception e) {
        StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
        _hc.getHarvestStatus().logMessage(errMessage.toString(), true);
    } //TESTED (test4)

}

From source file: com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java

License: Open Source License

@SuppressWarnings({ "unchecked", "rawtypes" })
public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
        throws IOException, SAXException, ParserConfigurationException {
    StringWriter xml = new StringWriter();
    String outputCollection = job.outputCollectionTemp;// (non-append mode) 
    if ((null != job.appendResults) && job.appendResults)
        outputCollection = job.outputCollection; // (append mode, write directly in....)
    else if (null != job.incrementalMode)
        job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)

    createConfigXML(xml, job.jobtitle, job.inputCollection,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
            job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
            job.reducer, job.combiner,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
            job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
            job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);

    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();

    URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
            savedClassLoader);
    Thread.currentThread().setContextClassLoader(child);

    // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable
    boolean dataModelLoaded = true;
    try {
        URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                null);
        try {
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
        } catch (ClassNotFoundException e2) {
            //(this is fine, will use the cached version)
            dataModelLoaded = false;
        }
        if (dataModelLoaded)
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
    } catch (ClassNotFoundException e1) {
        throw new RuntimeException(
                "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
    }

    // Now load the XML into a configuration object: 
    Configuration config = new Configuration();
    // Add the client configuration overrides:
    if (!bLocalMode) {
        String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
        config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
    } //TESTED

    try {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
        NodeList nList = doc.getElementsByTagName("property");

        for (int temp = 0; temp < nList.getLength(); temp++) {
            Node nNode = nList.item(temp);
            if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                Element eElement = (Element) nNode;
                String name = getTagValue("name", eElement);
                String value = getTagValue("value", eElement);
                if ((null != name) && (null != value)) {
                    config.set(name, value);
                }
            }
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage());
    }

    // Some other config defaults:
    // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
    config.set("mapred.map.tasks.speculative.execution", "false");
    config.set("mapred.reduce.tasks.speculative.execution", "false");
    // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)

    // Now run the JAR file
    try {
        BasicDBObject advancedConfigurationDbo = null;
        try {
            advancedConfigurationDbo = (null != job.query)
                    ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
                    : (new BasicDBObject());
        } catch (Exception e) {
            advancedConfigurationDbo = new BasicDBObject();
        }
        boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
        if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
            throw new RuntimeException(
                    "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
        }

        config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
        if (bLocalMode) { // local job tracker and FS mode
            config.set("mapred.job.tracker", "local");
            config.set("fs.default.name", "local");
        } else {
            if (bTestMode) { // run job tracker locally but FS mode remotely
                config.set("mapred.job.tracker", "local");
            } else { // normal job tracker
                String trackerUrl = HadoopUtils.getXMLProperty(
                        props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
                config.set("mapred.job.tracker", trackerUrl);
            }
            String fsUrl = HadoopUtils.getXMLProperty(
                    props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
            config.set("fs.default.name", fsUrl);
        }
        if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves
            Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.data_model.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
            jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.processing.custom.library.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
        } //TESTED

        // Debug scripts (only if they exist), and only in non local/test mode
        if (!bLocalMode && !bTestMode) {

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_map_error_handler.sh", config);
                config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_reduce_error_handler.sh", config);
                config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

        } //TODO (???): TOTEST

        // (need to do these 2 things here before the job is created, at which point the config class has been copied across)
        //1)
        Class<?> mapperClazz = Class.forName(job.mapper, true, child);
        if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook
            ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
                    .newInstance();
            preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
        } //TESTED
          //2)
        if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
            // Need to download the GridFSZip file
            try {
                Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
                        "GridFSZipFile.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
            } catch (Throwable t) {
            } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)            
        }

        if (job.inputCollection.equals("records")) {

            InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);

            //(won't run under 0.19 so running with "records" should cause all sorts of exceptions)

        } //TESTED (by hand)         

        if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
            config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
        }

        // Manually specified caches
        List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
                job, config, props_custom);

        Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
        try {

            if (null != localJarCaches) {
                if (bLocalMode || bTestMode) {
                    Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
                    method.setAccessible(true);
                    method.invoke(child, localJarCaches.toArray());

                } //TOTEST (tested logically)
            }
            Class<?> classToLoad = Class.forName(job.mapper, true, child);
            hj.setJarByClass(classToLoad);

            if (job.inputCollection.equalsIgnoreCase("filesystem")) {
                String inputPath = null;
                try {
                    inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    if (!inputPath.endsWith("/")) {
                        inputPath = inputPath + "/";
                    }
                } catch (Exception e) {
                }
                if (null == inputPath) {
                    throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
                }
                inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);

                InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
                InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
                InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
            } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {

                String[] oidStrs = null;
                try {
                    String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
                    Matcher m = oidExtractor.matcher(inputPath);
                    if (m.find()) {
                        oidStrs = m.group(1).split("\\s*,\\s*");

                    } else {
                        throw new RuntimeException(
                                "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
                    }
                    InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
                } catch (Exception e) {
                    throw new RuntimeException(
                            "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
                }

                hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
            } else if (job.inputCollection.equals("records")) {
                hj.setInputFormatClass((Class<? extends InputFormat>) Class
                        .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
            } else {
                if (esMode) {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
                            true, child));
                } else {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
                }
            }
            if ((null != job.exportToHdfs) && job.exportToHdfs) {

                //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)

                Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);

                if ((null != job.outputKey) && (null != job.outputValue)
                        && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                        && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
                    // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
                    TextOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
                else {
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                            "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
                    SequenceFileOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
            } else { // normal case, stays in MongoDB
                hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
            }
            hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
            String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
            if (null != mapperOutputKeyOverride) {
                hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
            } //TESTED 

            String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
            if (null != mapperOutputValueOverride) {
                hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
            } //TESTED 

            if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
                    && !job.reducer.equalsIgnoreCase("none")) {
                hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
                // Variable reducers:
                if (null != job.query) {
                    try {
                        hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
                    } catch (Exception e) {
                        try {
                            // (just check it's not a string that is a valid int)
                            hj.setNumReduceTasks(
                                    Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
                        } catch (Exception e2) {
                        }
                    }
                } //TESTED
            } else {
                hj.setNumReduceTasks(0);
            }
            if ((null != job.combiner) && !job.combiner.startsWith("#")
                    && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) {
                hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
            }
            hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
            hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

            hj.setJobName(job.jobtitle);
            currJobName = job.jobtitle;
        } catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
            throw new RuntimeException(e.getMessage(), e);
        }
        if (bTestMode || bLocalMode) {
            hj.submit();
            currThreadId = null;
            Logger.getRootLogger().addAppender(this);
            currLocalJobId = hj.getJobID().toString();
            currLocalJobErrs.setLength(0);
            while (!hj.isComplete()) {
                Thread.sleep(1000);
            }
            Logger.getRootLogger().removeAppender(this);
            if (hj.isSuccessful()) {
                if (this.currLocalJobErrs.length() > 0) {
                    return "local_done: " + this.currLocalJobErrs.toString();
                } else {
                    return "local_done";
                }
            } else {
                return "Error: " + this.currLocalJobErrs.toString();
            }
        } else {
            hj.submit();
            String jobId = hj.getJobID().toString();
            return jobId;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Thread.currentThread().setContextClassLoader(savedClassLoader);
        return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}