Example usage for com.mongodb BasicDBObject get

List of usage examples for com.mongodb BasicDBObject get

Introduction

On this page you can find usage examples for com.mongodb BasicDBObject.get.

Prototype

public Object get(final String key) 

Document

Gets a value from this object.
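
As the usage examples below illustrate, get returns Object, so callers typically cast the result to the expected type and check for null when a key may be absent. Here is a minimal, self-contained sketch of that pattern (the document and field names are illustrative only, not taken from the examples below):

import com.mongodb.BasicDBObject;

public class BasicDBObjectGetExample {
    public static void main(String[] args) {
        // Build a document with a nested sub-document
        BasicDBObject address = new BasicDBObject("city", "Springfield").append("zip", "12345");
        BasicDBObject doc = new BasicDBObject("name", "example").append("address", address);

        // get() returns Object, so cast to the expected type (String, BasicDBObject, ...)
        String name = (String) doc.get("name");
        BasicDBObject nested = (BasicDBObject) doc.get("address");

        // get() returns null for a missing key, hence the frequent null checks in the examples
        Object missing = doc.get("doesNotExist");

        System.out.println(name);               // example
        System.out.println(nested.get("city")); // Springfield
        System.out.println(missing);            // null
    }
}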

Usage

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

@SuppressWarnings("unchecked")
public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return false;
        }
    } catch (Exception e) {
        //DEBUG
        //e.printStackTrace();

        return false; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    keyFields.put(SourcePojo.highestDistributionFactorStored_, 1);

    // Get and remove the sourceKey information, incorporate into source query,
    // so it's nice and simple by the time it gets to the actual query
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);

    if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless 
        if (null != sourceKeyQueryTerm) {
            keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
        }
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED (including $all to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db", "tagtest2" ])
    else if (null != sourceKeyQueryTerm) {
        boolean sourceKeyQueryComplex = false;

        if (sourceKeyQueryTerm instanceof BasicDBObject) {
            BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm;
            if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be lt/lte/gt/gte
                for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) {
                    if (!sourceKeyQueryTermEl.equals(DbManager.in_)
                            && !sourceKeyQueryTermEl.equals(DbManager.lt_)
                            && !sourceKeyQueryTermEl.equals(DbManager.lte_)
                            && !sourceKeyQueryTermEl.equals(DbManager.gt_)
                            && !sourceKeyQueryTermEl.equals(DbManager.gte_)) {
                        sourceKeyQueryComplex = true;
                        break;
                    } //TESTED (eg ne)
                    else if (sourceKeyQueryTermEl.equals(DbManager.in_)
                            && (1 != sourceKeyQueryTermDbo.size())) {
                        sourceKeyQueryComplex = true;
                        break;
                    } //TESTED ((lt,in))
                }
            } //TESTED: (in, (gte,lt), ne)
            else {
                sourceKeyQueryComplex = true;
            } //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte":"test" } })
        } else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a regex
            sourceKeyQueryComplex = true;
        }
        //TESTED ($regex)

        if (sourceKeyQueryComplex) {
            keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below
        } else {
            return false; // already have a perfectly good source key specification
        }
    } //TESTED (See combinations above)

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
    int count = dbc.count();

    if (count > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return false;
    } else {
        ArrayList<String> sources = new ArrayList<String>(count);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
            sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor));
        }
        if (sources.isEmpty()) {
            throw new RuntimeException(); // will just return no splits at all, no problem
        } //TESTED
        if (1 == sources.size()) {
            query.put(DocumentPojo.sourceKey_, sources.get(0));
        } //TESTED
        else {
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
        } //TESTED

        return true;
    }
}

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources 
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;

    } // (end if too many source keys across the communities)
}

From source file: com.ikanow.infinit.e.data_model.utils.JsonPrettyPrinter.java

License: Apache License

public static void jsonObjectToTextFormatted(BasicDBObject jsonObject, int indent, StringBuffer jsonText) {
    jsonText.append("{ ");
    appendNewLine(indent + 2, jsonText);
    Set<String> keys = new TreeSet<String>(jsonObject.keySet());
    boolean isFirst = true;
    for (String key : keys) {
        Object val = jsonObject.get(key);
        if (null == val)
            continue;
        if (isFirst) {
            isFirst = false;
        } else {
            jsonText.append(", ");
            appendNewLine(indent + 2, jsonText);
        }
        jsonText.append("\"");
        jsonText.append(key);
        jsonText.append("\" : ");
        jsonObjectToTextFormatted(val, indent + 2, jsonText);
    }
    appendNewLine(indent, jsonText);
    jsonText.append("}");
}

From source file: com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License: Open Source License

/**
 * Tests to see if duplicates might exist.
 * If it is not a duplicate, true is returned. If it is a duplicate,
 * the modified date is then checked to see if the file has been updated.
 * True is returned if the file has been updated, false otherwise.
 *
 * @param modifiedDate
 * @param sourceUrl
 * @param source
 * @return boolean (true/false)
 */
public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                    if (null != source.getDistributionFactor()) { // This is a slightly more complex case because other...
                        //...threads for this source could be writing documents asynchronously ... so we're just going to disable everything
                        //if the most recent doc is _after_ our last harvest time (since this means we've already started harvesting the new source)
                        Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo
                                .get(DocumentPojo.created_);
                        if ((null != source.getHarvestStatus())
                                && (null != source.getHarvestStatus().getHarvested()
                                        && (null != mostRecentlyModifedFile_createdTime))) {
                            if (mostRecentlyModifedFile_createdTime
                                    .after(source.getHarvestStatus().getHarvested())) {
                                _mostRecentlyModifiedFile = null;
                                _mostRecentlyModifiedDocId = null;
                            }
                        } else { // If we don't have a date then force a "slow" dedup
                            _mostRecentlyModifiedFile = null;
                            _mostRecentlyModifiedDocId = null;
                        }
                    } //TESTED
                } //(found docs)
            } //(success mode)
        } catch (Exception e) {
        } // If anything goes wrong will just check all files (slower)         
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    else if (null == sourceUrl) {
        return true; // (for custom checking - if we couldn't get a cached value to compare against then assume we are inspecting)
    }

    // No short cut, go the long way round:      

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.sourceUrl_, sourceUrl);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2);
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).hint(hint).limit(1);
    // (this should be very fast since sourceUrl is indexed ... order doesn't matter as all docs should have the same modified)
    //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky

    if (!dbc.hasNext()) { //if there is no record, return true
        ret = true;
        modifiedDate.setTime(0);
    } else { // (all docs should have same modified, though this is ~ time ordered anyway)

        BasicDBObject dbo = (BasicDBObject) dbc.iterator().next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)); // ie if different -> true -> update docs from sourceUrl
        // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where)
        // (this is just backwards compatible for a deployment where this has happened for some % -probably 100- of the docs
        //  once an RPM >=5955 is deployed this will no longer be necessary)
    }
    return ret;
}

From source file: com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License: Open Source License

public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_,
                                new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED

                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                } //TESTED (found docs)

                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //   System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));

            } //(success mode)            
        } catch (Exception e) {
        } // If anything goes wrong will just check all files (slower)

    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...         
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED

    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No short cut, go the long way round:

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();

    if (!foundMatch) { //if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)               
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)         
        } else { // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}

From source file: com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License: Open Source License

private String isFunctionalDuplicate(SourcePojo source, LinkedList<String> candidateSourceKeys) {
    // (Ensure everything's set up)
    if (null == _sameConfigurationSources) {
        _sameConfigurationSources = new TreeSet<String>();
        _differentConfigurationSources = new TreeSet<String>();
        _sameCommunitySources = new TreeSet<String>();
    }
    if (null == source.getShah256Hash()) {
        source.generateShah256Hash();
    }

    // See if we've cached something:
    String returnVal = null;
    Iterator<String> it = candidateSourceKeys.iterator();
    while (it.hasNext()) {
        String sourceKey = it.next();

        if (!source.getDuplicateExistingUrls()) {
            // Check _sameCommunitySources: ignore+carry on if sourceKey isn't in here, else 
            // return sourceKey, which will treat as a non-update duplicate (non update because 
            // the update params only set if it was an update duplicate)
            if (_sameCommunitySources.contains(sourceKey)) {
                return source.getKey(); // (ie return fake source key that will cause above logic to occur)
            }
        } //TESTED

        if (sourceKey.equalsIgnoreCase(source.getKey())) {
            return sourceKey; // (the calling function will then treat it as a duplicate)
        } else if (_sameConfigurationSources.contains(sourceKey)) {
            returnVal = sourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
        } else if (_differentConfigurationSources.contains(sourceKey)) {
            it.remove(); // (don't need to check this source out)
        }
    } //TESTED
    boolean bMatchedInCommunity = false; // (duplication logic below)
    if ((null == returnVal) && !candidateSourceKeys.isEmpty()) {

        // Need to query the DB for this source...         
        BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_, source.getShah256Hash());
        query.put(SourcePojo.key_, new BasicDBObject(MongoDbManager.in_, candidateSourceKeys.toArray()));
        BasicDBObject fields = new BasicDBObject(SourcePojo._id_, 0);
        fields.put(SourcePojo.key_, 1);
        if (!source.getDuplicateExistingUrls()) {
            fields.put(SourcePojo.communityIds_, 1);
        }
        DBCursor dbc = DbManager.getIngest().getSource().find(query, fields);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sSourceKey = dbo.getString(SourcePojo.key_);

            // DON'T DEDUP LOGIC:
            if (!source.getDuplicateExistingUrls()) {
                BasicDBList communities = (BasicDBList) dbo.get(SourcePojo.communityIds_);
                for (Object communityIdObj : communities) {
                    ObjectId communityId = (ObjectId) communityIdObj;
                    if (source.getCommunityIds().contains(communityId)) { // Not allowed to duplicate off this
                        _sameCommunitySources.add(sSourceKey);
                        bMatchedInCommunity = true;
                    }
                }
            } //(end "don't duplicate existing URLs logic")
              //TESTED (same community and different communities)

            if (null != sSourceKey) {
                _sameConfigurationSources.add(sSourceKey);
                returnVal = sSourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
            }
        }
        // Loop over config sources again to work out which keys can now be placed in the "_differentConfigurationSources" cache
        for (String sourceKey : candidateSourceKeys) {
            if (!_sameConfigurationSources.contains(sourceKey)) {
                _differentConfigurationSources.add(sourceKey);
            }
        }
    } //TESTED
    if (bMatchedInCommunity) {
        return source.getKey(); // (ie return fake source key that will cause above logic to occur)
    } else {
        return returnVal;
    }

}

From source file: com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java

License: Open Source License

@Override
public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) {
    if (_isDirectory) {
        if (_isShare) { // must be a zip file
            ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>();
            @SuppressWarnings("unchecked")
            Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries();
            while (entries.hasMoreElements()) {
                net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName());
                zipFiles.add(newFile);
            }
            return zipFiles.toArray(new InfiniteFile[zipFiles.size()]);
        } //TESTED (3.2)
        else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory"
            String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_);
            String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_);
            if (null == outputDatabase) {
                outputDatabase = "custommr";
            }
            DBCollection outColl = null;
            DBCursor dbc = null;
            if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory

                DBCollection chunks = MongoDbManager.getCollection("config", "chunks");
                StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection);
                dbc = chunks.find(new BasicDBObject("ns", ns.toString()));
                int splits = dbc.count();

                if (splits < 2) { // Nothing to do (unsharded or 1 chunk)
                    dbc.close();

                    outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                    dbc = outColl.find();
                } //TESTED (4.2)
                else { // Create one virtual dir per split
                    InfiniteFile[] virtualDirs = new InfiniteFile[splits];
                    int added = 0;
                    for (DBObject splitObj : dbc) {
                        BasicDBObject minObj = (BasicDBObject) splitObj.get("min");
                        BasicDBObject maxObj = (BasicDBObject) splitObj.get("max");
                        ObjectId minId = null;
                        try {
                            minId = (ObjectId) minObj.get("_id");
                        } catch (Exception e) {
                        } // min key..
                        ObjectId maxId = null;
                        try {
                            maxId = (ObjectId) maxObj.get("_id");
                        } catch (Exception e) {
                        } // max key..

                        //Handle current case where custom jobs are all dumped in with the wrong _id type                     
                        if ((null != minId) || (null != maxId)) {
                            if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below)

                                if (maxId.getTime() < optionalFilterDate.getTime()) {
                                    // (the "getTime()"s can overlap across chunks so we have to use minId
                                    //  and accept that we'll often deserialize 1+ extra chunk every harvest)
                                    continue;
                                }
                            } //TESTED (by hand)

                            InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId);
                            virtualDirs[added] = split;
                            added++;
                        } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand)
                    }
                    dbc.close();
                    return virtualDirs;
                } //TESTED (5.2.2, 6.2.2)
            } //TESTED
            else { // Virtual directory
                BasicDBObject query = new BasicDBObject();
                if (null != _virtualDirStartLimit) {
                    if (null != optionalFilterDate) {
                        ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                        //(zero out the inc/machine ids so this query is independent to calling service)

                        if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit)
                            query.put(MongoDbManager.gte_, altStartId);
                        } else {
                            query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                        }
                    } //TESTED (by hand)
                    else { // normal case
                        query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                    }
                } else if (null != optionalFilterDate) { // (first chunk so always overwrite with optionalFilter date if applicable)
                    ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                    query.put(MongoDbManager.gte_, altStartId);
                } //TESTED (by hand)
                if (null != _virtualDirEndLimit) {
                    query.put(MongoDbManager.lt_, _virtualDirEndLimit);
                }

                outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle);
            } //TESTED (6.2.2) (doc skipping by hand)

            if (null != outColl) { // has files, create the actual file objects
                //DEBUG
                //System.out.println("CHUNK: GOT " + dbc.count());

                int docCount = dbc.count();
                if (docCount > 1 + maxDocsPerCycle) {
                    docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway)
                }
                InfiniteFile[] docs = new InfiniteFile[docCount];
                int added = 0;
                for (DBObject docObj : dbc) {
                    // (if didn't use a query then apply internal filter date by hand)
                    if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)
                            && (null != optionalFilterDate)) {
                        ObjectId docId = (ObjectId) docObj.get("_id");
                        if (optionalFilterDate.getTime() > docId.getTime()) {
                            continue;
                        }
                    } //TESTED

                    if (added >= maxDocsPerCycle) { // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to)
                        docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"),
                                _virtualDirEndLimit);
                        break;
                    } else {
                        InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj);
                        docs[added] = doc;
                    } //TESTED (both cases)
                    added++;
                }
                dbc.close();
                return docs;

            } //TESTED (4.2)
        }
    } else { // can just return myself
        InfiniteFile[] retVal = new InfiniteFile[1];
        retVal[0] = this;
        return retVal;
    } //TESTED (1.2, 2.2)
    return null;
}

From source file: com.ikanow.infinit.e.harvest.HarvestController.java

License: Open Source License

private static String getDocumentContentFromWhichToDuplicate(DocumentPojo docToReplace) {
    try {
        // Get the full text:
        byte[] storageArray = new byte[200000];
        BasicDBObject contentQ = new BasicDBObject("url", docToReplace.getUrl());
        contentQ.put(CompressedFullTextPojo.sourceKey_,
                new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, docToReplace.getSourceKey())));
        BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
        BasicDBObject dboContent = (BasicDBObject) DbManager.getDocument().getContent().findOne(contentQ,
                fields);
        if (null != dboContent) {
            byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
            ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
            GZIPInputStream gzip = new GZIPInputStream(in);
            int nRead = 0;
            StringBuffer output = new StringBuffer();
            while (nRead >= 0) {
                nRead = gzip.read(storageArray, 0, 200000);
                if (nRead > 0) {
                    String s = new String(storageArray, 0, nRead, "UTF-8");
                    output.append(s);
                }
            }
            return output.toString();
        } else { // Will just need to re-process this document
            return null;
        }
    } catch (Exception e) {
        // Do nothing, just carry on
        e.printStackTrace();
    }
    return null;
}

From source file: com.ikanow.infinit.e.harvest.HarvestControllerPipeline.java

License: Open Source License

private void splitDocuments(DocumentPojo doc, SourcePojo source, SourcePipelinePojo splitter,
        List<DocumentPojo> docs) {
    try {
        if (null == source.getRssConfig()) {
            source.setRssConfig(new SourceRssConfigPojo());
        }
        if (null != source.getRssConfig().getExtraUrls()) { // refreshed ready for new document
            source.getRssConfig().setExtraUrls(null);
        }

        HashMap<String, Object> jsonLookup = new HashMap<String, Object>();
        if ((null != splitter.splitter.getScriptlang())
                && splitter.splitter.getScriptlang().startsWith("automatic")) {
            // (automatic or automatic_json or automatic_xml)

            String[] args = splitter.splitter.getScript().split("\\s*,\\s*");
            Object[] objList = null;

            String field = args[0];
            if (field.startsWith(DocumentPojo.fullText_)) { // fullText, or fullText.[x] where [x] is the root value

                DocumentPojo dummyDoc = new DocumentPojo();
                dummyDoc.setFullText(doc.getFullText());
                MetadataSpecPojo dummyContent = new MetadataSpecPojo();
                dummyContent.fieldName = "extract";
                dummyContent.scriptlang = "stream";
                dummyContent.flags = "o";

                if (field.equals(DocumentPojo.fullText_)) { // fullText
                    dummyContent.script = "";
                } else {
                    dummyContent.script = field.substring(1 + DocumentPojo.fullText_.length()); //+1 for the "."
                }
                _uah.processMetadataChain(dummyDoc, Arrays.asList(dummyContent), source.getRssConfig(), null);

                BasicDBObject dummyDocDbo = (BasicDBObject) dummyDoc.toDb();
                dummyDocDbo = (BasicDBObject) dummyDocDbo.get(DocumentPojo.metadata_);
                if (null != dummyDocDbo) {
                    objList = ((Collection<?>) (dummyDocDbo.get("extract"))).toArray(); // (returns a list of strings)
                }
            } //TESTED (doc_splitter_test_auto_json, json: test3, xml: test4)
            else if (field.startsWith(DocumentPojo.metadata_)) { // field starts with "metadata."
                objList = doc.getMetadata().get(field.substring(1 + DocumentPojo.metadata_.length())); //+1 for the "."               
            } //TESTED (doc_splitter_test_auto_json, test1)
            else { // direct reference to metadata field
                objList = doc.getMetadata().get(field);
            } //TESTED (doc_splitter_test_auto_json, test2)

            if ((null != objList) && (objList.length > 0)) {
                source.getRssConfig().setExtraUrls(new ArrayList<ExtraUrlPojo>(objList.length));
                int num = 0;
                for (Object o : objList) {
                    num++;
                    ExtraUrlPojo url = new ExtraUrlPojo();
                    if ((1 == args.length) || !(o instanceof DBObject)) { // generate default URL
                        url.url = doc.getUrl() + "#" + num;
                    } //TESTED (doc_splitter_test_auto_json, test1)
                    else if (2 == args.length) { // url specified in the format <fieldname-in-dot-notation>
                        url.url = MongoDbUtil.getProperty((DBObject) o, args[1]);
                    } //TESTED (doc_splitter_test_auto_json, test2)
                    else { // url specified in format <message-format-with-{1}-{2}-etc>,<fieldname-in-dot-notation-for-1>,..
                        ArrayList<Object> cmdArgs = new ArrayList<Object>(args.length - 1); //-2 + 1 (+1 - see below)
                        cmdArgs.add("[INDEX_FROM_1_NOT_0]");
                        for (int j = 2; j < args.length; ++j) {
                            cmdArgs.add(MongoDbUtil.getProperty((DBObject) o, args[j]));
                        }
                        url.url = MessageFormat.format(args[1], cmdArgs.toArray());
                    } //TESTED (doc_splitter_test_auto_json, test3, test4)

                    if (null == url.url) { // (if we can't extract a URL then bail out)
                        continue;
                    }

                    url.title = new StringBuffer(doc.getTitle()).append(" (").append(num).append(")")
                            .toString();
                    url.fullText = o.toString();
                    source.getRssConfig().getExtraUrls().add(url);
                    if (splitter.splitter.getScriptlang().startsWith("automatic_")) { // automatic_json or automatic_xml
                        jsonLookup.put(url.url, o);
                    }
                }
            } //TESTED (doc_splitter_test_auto_json)
        } else { // normal case - run the 'follow web links' code to get the docs
            source.getRssConfig().setSearchConfig(splitter.splitter);

            FeedHarvester_searchEngineSubsystem subsys = new FeedHarvester_searchEngineSubsystem();
            subsys.generateFeedFromSearch(source, _hc, doc);
        }
        if (null != source.getRssConfig().getExtraUrls()) {
            for (ExtraUrlPojo newDocInfo : source.getRssConfig().getExtraUrls()) {
                if (null == doc.getSourceUrl()) { // (if sourceUrl != null, bypass it because it's been generated by a file so is being deleted anyway)
                    //(note: this null check above is relied upon by the federated query engine, so don't go randomly changing it!) 

                    if (_hc.getDuplicateManager().isDuplicate_Url(newDocInfo.url, source, null)) {
                        //TODO: should handle updateCycle_secs?
                        continue;
                    }
                }
                DocumentPojo newDoc = new DocumentPojo();
                newDoc.setCreated(doc.getCreated());
                newDoc.setModified(doc.getModified());
                newDoc.setUrl(newDocInfo.url);
                newDoc.setTitle(newDocInfo.title);
                newDoc.setDescription(newDocInfo.description);
                newDoc.setFullText(newDocInfo.fullText);

                // For JSON, also create the metadata
                if (null != splitter.splitter.getScriptlang()) {
                    if (splitter.splitter.getScriptlang().equals("automatic_json")) {
                        newDoc.addToMetadata("json", jsonLookup.get(newDoc.getUrl()));
                    } else if (splitter.splitter.getScriptlang().equals("automatic_xml")) {
                        Object obj = jsonLookup.get(newDoc.getUrl());
                        if (obj instanceof DBObject) {
                            DBObject dbo = (DBObject) obj;
                            for (String key : dbo.keySet()) {
                                Object objArray = dbo.get(key);
                                if (objArray instanceof Object[]) {
                                    newDoc.addToMetadata(key, (Object[]) objArray);
                                } else if (objArray instanceof Collection<?>) {
                                    newDoc.addToMetadata(key, ((Collection<?>) objArray).toArray());
                                }
                            }
                        } //(test4)
                    }
                } //TESTED (doc_splitter_test_auto_json, test1:json, test4:xml)

                // Published date is a bit more complex
                if (null != newDocInfo.publishedDate) {
                    try {
                        newDoc.setPublishedDate(new Date(DateUtility.parseDate(newDocInfo.publishedDate)));
                    } catch (Exception e) {
                    }
                } //TESTED (test3,test4)
                if (null == newDoc.getPublishedDate()) {
                    newDoc.setPublishedDate(doc.getPublishedDate());
                } //TESTED (test1)
                if (null == newDoc.getPublishedDate()) {
                    newDoc.setPublishedDate(doc.getCreated());
                } //TESTED (test2)
                newDoc.setTempSource(source);
                newDoc.setSource(doc.getSource());
                newDoc.setMediaType(doc.getMediaType());
                newDoc.setSourceKey(doc.getSourceKey());
                newDoc.setSourceUrl(doc.getSourceUrl()); // (otherwise won't be able to delete child docs that come from a file)
                newDoc.setCommunityId(doc.getCommunityId());
                newDoc.setDocGeo(doc.getDocGeo());
                newDoc.setIndex(doc.getIndex());

                newDoc.setSpawnedFrom(splitter);
                docs.add(newDoc);
            } //end loop over URLs
        } //TESTED
    } catch (Exception e) {
        StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
        _hc.getHarvestStatus().logMessage(errMessage.toString(), true);
    } //TESTED (test4)

}

From source file: com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java

License: Open Source License

@SuppressWarnings({ "unchecked", "rawtypes" })
public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
        throws IOException, SAXException, ParserConfigurationException {
    StringWriter xml = new StringWriter();
    String outputCollection = job.outputCollectionTemp;// (non-append mode) 
    if ((null != job.appendResults) && job.appendResults)
        outputCollection = job.outputCollection; // (append mode, write directly in....)
    else if (null != job.incrementalMode)
        job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)

    createConfigXML(xml, job.jobtitle, job.inputCollection,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
            job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
            job.reducer, job.combiner,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
            job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
            job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);

    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();

    URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
            savedClassLoader);
    Thread.currentThread().setContextClassLoader(child);

    // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable
    boolean dataModelLoaded = true;
    try {
        URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                null);
        try {
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
        } catch (ClassNotFoundException e2) {
            //(this is fine, will use the cached version)
            dataModelLoaded = false;
        }
        if (dataModelLoaded)
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
    } catch (ClassNotFoundException e1) {
        throw new RuntimeException(
                "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
    }

    // Now load the XML into a configuration object: 
    Configuration config = new Configuration();
    // Add the client configuration overrides:
    if (!bLocalMode) {
        String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
        config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
    } //TESTED

    try {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
        NodeList nList = doc.getElementsByTagName("property");

        for (int temp = 0; temp < nList.getLength(); temp++) {
            Node nNode = nList.item(temp);
            if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                Element eElement = (Element) nNode;
                String name = getTagValue("name", eElement);
                String value = getTagValue("value", eElement);
                if ((null != name) && (null != value)) {
                    config.set(name, value);
                }
            }
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage());
    }

    // Some other config defaults:
    // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
    config.set("mapred.map.tasks.speculative.execution", "false");
    config.set("mapred.reduce.tasks.speculative.execution", "false");
    // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)

    // Now run the JAR file
    try {
        BasicDBObject advancedConfigurationDbo = null;
        try {
            advancedConfigurationDbo = (null != job.query)
                    ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
                    : (new BasicDBObject());
        } catch (Exception e) {
            advancedConfigurationDbo = new BasicDBObject();
        }
        boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
        if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
            throw new RuntimeException(
                    "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
        }

        config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
        if (bLocalMode) { // local job tracker and FS mode
            config.set("mapred.job.tracker", "local");
            config.set("fs.default.name", "local");
        } else {
            if (bTestMode) { // run job tracker locally but FS mode remotely
                config.set("mapred.job.tracker", "local");
            } else { // normal job tracker
                String trackerUrl = HadoopUtils.getXMLProperty(
                        props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
                config.set("mapred.job.tracker", trackerUrl);
            }
            String fsUrl = HadoopUtils.getXMLProperty(
                    props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
            config.set("fs.default.name", fsUrl);
        }
        if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves
            Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.data_model.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
            jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.processing.custom.library.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
        } //TESTED

        // Debug scripts (only if they exist), and only in non local/test mode
        if (!bLocalMode && !bTestMode) {

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_map_error_handler.sh", config);
                config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_reduce_error_handler.sh", config);
                config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

        } //TODO (???): TOTEST

        // (need to do these 2 things here before the job is created, at which point the config class has been copied across)
        //1)
        Class<?> mapperClazz = Class.forName(job.mapper, true, child);
        if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook
            ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
                    .newInstance();
            preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
        } //TESTED
          //2)
        if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
            // Need to download the GridFSZip file
            try {
                Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
                        "GridFSZipFile.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
            } catch (Throwable t) {
            } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)            
        }

        if (job.inputCollection.equals("records")) {

            InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);

            //(won't run under 0.19 so running with "records" should cause all sorts of exceptions)

        } //TESTED (by hand)         

        if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
            config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
        }

        // Manually specified caches
        List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
                job, config, props_custom);

        Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
        try {

            if (null != localJarCaches) {
                if (bLocalMode || bTestMode) {
                    Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
                    method.setAccessible(true);
                    method.invoke(child, localJarCaches.toArray());

                } //TOTEST (tested logically)
            }
            Class<?> classToLoad = Class.forName(job.mapper, true, child);
            hj.setJarByClass(classToLoad);

            if (job.inputCollection.equalsIgnoreCase("filesystem")) {
                String inputPath = null;
                try {
                    inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    if (!inputPath.endsWith("/")) {
                        inputPath = inputPath + "/";
                    }
                } catch (Exception e) {
                }
                if (null == inputPath) {
                    throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
                }
                inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);

                InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
                InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
                InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
            } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {

                String[] oidStrs = null;
                try {
                    String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
                    Matcher m = oidExtractor.matcher(inputPath);
                    if (m.find()) {
                        oidStrs = m.group(1).split("\\s*,\\s*");

                    } else {
                        throw new RuntimeException(
                                "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
                    }
                    InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
                } catch (Exception e) {
                    throw new RuntimeException(
                            "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
                }

                hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
            } else if (job.inputCollection.equals("records")) {
                hj.setInputFormatClass((Class<? extends InputFormat>) Class
                        .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
            } else {
                if (esMode) {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
                            true, child));
                } else {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
                }
            }
            if ((null != job.exportToHdfs) && job.exportToHdfs) {

                //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)

                Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);

                if ((null != job.outputKey) && (null != job.outputValue)
                        && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                        && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
                    // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
                    TextOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
                else {
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                            "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
                    SequenceFileOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
            } else { // normal case, stays in MongoDB
                hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
            }
            hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
            String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
            if (null != mapperOutputKeyOverride) {
                hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
            } //TESTED 

            String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
            if (null != mapperOutputValueOverride) {
                hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
            } //TESTED 

            if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
                    && !job.reducer.equalsIgnoreCase("none")) {
                hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
                // Variable reducers:
                if (null != job.query) {
                    try {
                        hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
                    } catch (Exception e) {
                        try {
                            // (just check it's not a string that is a valid int)
                            hj.setNumReduceTasks(
                                    Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
                        } catch (Exception e2) {
                        }
                    }
                } //TESTED
            } else {
                hj.setNumReduceTasks(0);
            }
            if ((null != job.combiner) && !job.combiner.startsWith("#")
                    && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) {
                hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
            }
            hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
            hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

            hj.setJobName(job.jobtitle);
            currJobName = job.jobtitle;
        } catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
            throw new RuntimeException(e.getMessage(), e);
        }
        if (bTestMode || bLocalMode) {
            hj.submit();
            currThreadId = null;
            Logger.getRootLogger().addAppender(this);
            currLocalJobId = hj.getJobID().toString();
            currLocalJobErrs.setLength(0);
            while (!hj.isComplete()) {
                Thread.sleep(1000);
            }
            Logger.getRootLogger().removeAppender(this);
            if (hj.isSuccessful()) {
                if (this.currLocalJobErrs.length() > 0) {
                    return "local_done: " + this.currLocalJobErrs.toString();
                } else {
                    return "local_done";
                }
            } else {
                return "Error: " + this.currLocalJobErrs.toString();
            }
        } else {
            hj.submit();
            String jobId = hj.getJobID().toString();
            return jobId;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Thread.currentThread().setContextClassLoader(savedClassLoader);
        return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}