Example usage for com.mongodb BasicDBObject getString

Introduction

This page lists example usages of com.mongodb BasicDBObject getString.

Prototype

public String getString(final String key) 

Document

Returns the value of a field as a string.
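
Before the project examples below, here is a minimal, self-contained sketch of the call (the class and field names are made up purely for illustration). getString returns the stored value as a String, converting non-String values via toString(), and returns null when the field is absent.

import com.mongodb.BasicDBObject;

public class GetStringExample {
    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject("title", "Example").append("views", 42);

        String title = doc.getString("title"); // "Example"
        String views = doc.getString("views"); // "42" (non-String values are converted via toString())

        // A missing field returns null, so guard before use
        String author = doc.getString("author");
        if (null == author) {
            author = "unknown";
        }

        System.out.println(title + " / " + views + " / " + author);
    }
}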

Usage

From source file:com.ikanow.infinit.e.data_model.api.knowledge.DocumentPojoApiMap.java

License:Apache License

public static void mapToApi(BasicDBObject doc) {
    // 1. (doc_index field)
    doc.remove(DocumentPojo.index_);
    // 2. (source title)
    String tmp = doc.getString(DocumentPojo.source_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.source_, array);
    }
    // 3. (source key)
    tmp = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.sourceKey_, array);
    }
    // 4. (media type)
    tmp = doc.getString(DocumentPojo.mediaType_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.mediaType_, array);
    }

}

From source file:com.ikanow.infinit.e.data_model.utils.MongoTransactionLock.java

License:Apache License

protected synchronized boolean getToken() {
    DBCollection cachedCollection = _collections.get();

    boolean bDefinitelyHaveControl = false;
    String hostname = null;
    String oneUp = null;

    // 1] Get Lock Object (create one and assert control if it doesn't exist)

    BasicDBObject lockObj = (BasicDBObject) cachedCollection.findOne();
    if (null == lockObj) { // Currently the DB is unlocked
        hostname = getHostname();
        oneUp = Long.toString(1000000L * (new Date().getTime() % 10000));
        // (ie a randomish start number)

        lockObj = new BasicDBObject(id_, _lockId);
        lockObj.put(hostname_, hostname);
        lockObj.put(oneUp_, oneUp);
        lockObj.put(lastUpdated_, new Date());

        //logger.debug("Creating a new aggregation lock object: " + lockObj.toString());

        try {
            cachedCollection.insert(lockObj, WriteConcern.SAFE);
            // (will fail if another harvester gets there first)
            bDefinitelyHaveControl = true;
        } catch (Exception e) { // Someone else has created it in the meantime
            lockObj = (BasicDBObject) cachedCollection.findOne();
        }

    } //TESTED

    // (So by here lockObj is always non-null)

    // 2] Do I have control?

    if (bDefinitelyHaveControl) {
        _bHaveControl = true;
        _nLastCheck = 0;
    } else {
        hostname = lockObj.getString(hostname_);
        oneUp = lockObj.getString(oneUp_);
        _bHaveControl = getHostname().equals(hostname);
    }
    // 3] If not, has the lock object been static for >= 1 minute

    if (!_bHaveControl) { // Don't currently have control
        long nNow = new Date().getTime();
        if (0 == _nLastCheck) {
            _nLastCheck = nNow;
        }

        if ((nNow - _nLastCheck) > 60000) { // re-check every minute
            if (_savedHostname.equals(hostname) && _savedOneUp.equals(oneUp)) { // Now I have control...
                //logger.debug("I am taking control from: " + hostname + ", " + oneUp);

                if (updateToken(true)) { // Try to grab control:                  
                    _bHaveControl = true;
                } else { // (else someone else snagged control just carry on)
                    _nLastCheck = 0; // (reset clock again anyway)
                }

            } //(if lock has remained static)
        } //(end if >=1 minutes has passed)

    } //(end if don't have control)

    // 4] Update saved state 

    _savedHostname = hostname;
    _savedOneUp = oneUp;

    return _bHaveControl;
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

private String isFunctionalDuplicate(SourcePojo source, LinkedList<String> candidateSourceKeys) {
    // (Ensure everything's set up)
    if (null == _sameConfigurationSources) {
        _sameConfigurationSources = new TreeSet<String>();
        _differentConfigurationSources = new TreeSet<String>();
        _sameCommunitySources = new TreeSet<String>();
    }
    if (null == source.getShah256Hash()) {
        source.generateShah256Hash();
    }

    // See if we've cached something:
    String returnVal = null;
    Iterator<String> it = candidateSourceKeys.iterator();
    while (it.hasNext()) {
        String sourceKey = it.next();

        if (!source.getDuplicateExistingUrls()) {
            // Check _sameCommunitySources: ignore+carry on if sourceKey isn't in here, else 
            // return sourceKey, which will treat as a non-update duplicate (non update because 
            // the update params only set if it was an update duplicate)
            if (_sameCommunitySources.contains(sourceKey)) {
                return source.getKey(); // (ie return fake source key that will cause above logic to occur)
            }
        } //TESTED

        if (sourceKey.equalsIgnoreCase(source.getKey())) {
            return sourceKey; // (the calling function will then treat it as a duplicate)
        } else if (_sameConfigurationSources.contains(sourceKey)) {
            returnVal = sourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
        } else if (_differentConfigurationSources.contains(sourceKey)) {
            it.remove(); // (don't need to check this source out)
        }
    } //TESTED
    boolean bMatchedInCommunity = false; // (duplication logic below)
    if ((null == returnVal) && !candidateSourceKeys.isEmpty()) {

        // Need to query the DB for this source...         
        BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_, source.getShah256Hash());
        query.put(SourcePojo.key_, new BasicDBObject(MongoDbManager.in_, candidateSourceKeys.toArray()));
        BasicDBObject fields = new BasicDBObject(SourcePojo._id_, 0);
        fields.put(SourcePojo.key_, 1);
        if (!source.getDuplicateExistingUrls()) {
            fields.put(SourcePojo.communityIds_, 1);
        }
        DBCursor dbc = DbManager.getIngest().getSource().find(query, fields);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sSourceKey = dbo.getString(SourcePojo.key_);

            // DON'T DEDUP LOGIC:
            if (!source.getDuplicateExistingUrls()) {
                BasicDBList communities = (BasicDBList) dbo.get(SourcePojo.communityIds_);
                for (Object communityIdObj : communities) {
                    ObjectId communityId = (ObjectId) communityIdObj;
                    if (source.getCommunityIds().contains(communityId)) { // Not allowed to duplicate off this
                        _sameCommunitySources.add(sSourceKey);
                        bMatchedInCommunity = true;
                    }
                }
            } //(end "don't duplicate existing URLs logic")
              //TESTED (same community and different communities)

            if (null != sSourceKey) {
                _sameConfigurationSources.add(sSourceKey);
                returnVal = sSourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive)
            }
        }
        // Loop over config sources again to work out which keys can now be placed in the "_differentConfigurationSources" cache
        for (String sourceKey : candidateSourceKeys) {
            if (!_sameConfigurationSources.contains(sourceKey)) {
                _differentConfigurationSources.add(sourceKey);
            }
        }
    } //TESTED
    if (bMatchedInCommunity) {
        return source.getKey(); // (ie return fake source key that will cause above logic to occur)
    } else {
        return returnVal;
    }

}

From source file:com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester_searchEngineSubsystem.java

License:Open Source License

public void generateFeedFromSearch(SourcePojo src, HarvestContext context, DocumentPojo docToSplit)
        throws Exception {

    if (context.isStandalone()) {
        maxDocsPerCycle = context.getStandaloneMaxDocs();
    }
    // otherwise get everything and worry about max docs in the main feed harvester
    // (probably slightly less efficient than checking duplicates here, but much simpler, can 
    //  always change it later)

    String savedUrl = src.getUrl();
    SourceRssConfigPojo feedConfig = src.getRssConfig();
    SourceSearchFeedConfigPojo searchConfig = (null == feedConfig) ? null : feedConfig.getSearchConfig();
    if ((null == feedConfig) || (null == searchConfig)) {
        return;
    }
    String savedProxyOverride = feedConfig.getProxyOverride();
    String savedTextExtractor = src.useTextExtractor();
    String savedFeatureExtractor = src.useExtractor();
    LinkedHashMap<String, String> savedExtractorOptions = src.getExtractorOptions();
    if ((null != searchConfig.getAuthExtractor()) && searchConfig.getAuthExtractor().equals("none")) {
        searchConfig.setAuthExtractor(null);
    }
    LinkedHashMap<String, Object[]> authenticationMeta = new LinkedHashMap<String, Object[]>();

    // Not allowed to stop paginating on duplicate in success_iteration/error cases
    if ((null == src.getHarvestStatus())
            || (HarvestEnum.success != src.getHarvestStatus().getHarvest_status())) {
        searchConfig.setStopPaginatingOnDuplicate(false);
    } //TESTED      

    UnstructuredAnalysisConfigPojo savedUAHconfig = src.getUnstructuredAnalysisConfig(); // (can be null)
    String savedUserAgent = feedConfig.getUserAgent();
    LinkedHashMap<String, String> savedHttpFields = feedConfig.getHttpFields();
    Integer savedWaitTimeOverride_ms = feedConfig.getWaitTimeOverride_ms();

    // Create a deduplication set to ensure URLs derived from the search pages don't duplicate the originals
    // (and also derived URLs)
    HashSet<String> dedupSet = new HashSet<String>();
    if (null != src.getRssConfig().getExtraUrls()) {
        Iterator<ExtraUrlPojo> itDedupUrls = src.getRssConfig().getExtraUrls().iterator();
        while (itDedupUrls.hasNext()) {
            ExtraUrlPojo itUrl = itDedupUrls.next();
            if (null != itUrl.title) {
                String dedupUrl = itUrl.url;
                dedupSet.add(dedupUrl);
                if (maxDocsPerCycle != Integer.MAX_VALUE) {
                    maxDocsPerCycle++; // (ensure we get as far as adding these)
                }
            }
        }
    } //TESTED

    Iterator<ExtraUrlPojo> itUrls = null;

    // Spider parameters used in conjunction with itUrls
    List<ExtraUrlPojo> iteratingList = null;
    List<ExtraUrlPojo> waitingList = null;
    int nIteratingDepth = 0;

    // (ie no URL specified, so using extra URLs as search URLs - and optionally as real URLs also)
    if ((null == savedUrl) && (null != src.getRssConfig().getExtraUrls())
            && !src.getRssConfig().getExtraUrls().isEmpty()) {
        // Spider logic:
        iteratingList = src.getRssConfig().getExtraUrls();
        // (end spidering logic)

        itUrls = iteratingList.iterator();
        src.getRssConfig().setExtraUrls(new LinkedList<ExtraUrlPojo>());
        // (ie overwrite the original list)
    } //TESTED

    for (;;) { // The logic for this loop can vary...
        if (dedupSet.size() >= maxDocsPerCycle) {
            break;
        }
        String currTitle = null;
        String currFullText = null;
        String currDesc = null;

        if (null != itUrls) {

            ExtraUrlPojo urlPojo = itUrls.next();
            savedUrl = urlPojo.url;
            if (0 == nIteratingDepth) {
                if (null != urlPojo.title) { // Also harvest this
                    src.getRssConfig().getExtraUrls().add(urlPojo);
                    if (maxDocsPerCycle != Integer.MAX_VALUE) {
                        maxDocsPerCycle--; // (now added, can remove)
                    }
                }
            }
            currTitle = urlPojo.title;
            currDesc = urlPojo.description;
            currFullText = urlPojo.fullText;
        } //TESTED

        try { // If we error out, we're probably going to abandon the entire search

            // We're going to loop over pages

            // Apply the regex to the URL for pagination, part 1

            int nResultOffset = 0;
            int nMaxPages = 1;
            Pattern pageChangeRegex = null;
            Matcher pageChangeRegexMatcher = null;
            if (null != feedConfig.getSearchConfig().getPageChangeRegex()) {
                pageChangeRegex = Pattern.compile(feedConfig.getSearchConfig().getPageChangeRegex(),
                        Pattern.CASE_INSENSITIVE);
                pageChangeRegexMatcher = pageChangeRegex.matcher(savedUrl);
                nMaxPages = feedConfig.getSearchConfig().getNumPages();

                if (pageChangeRegexMatcher.find()) {
                    String group = pageChangeRegexMatcher.group(1);
                    if (null != group) {
                        try {
                            nResultOffset = Integer.parseInt(group);
                        } catch (Exception e) {
                        } // just carry on
                    }
                } else { // URL doesn't match
                    pageChangeRegexMatcher = null;
                } //TESTED

            } //TESTED

            // Page limit check (see also nLinksFound/nCurrDedupSetSize inside loop)
            int nMinLinksToExitLoop = 10; // (use to check one iteration past the point at which nothing happens)

            // If checking vs duplicates then have a flag to exit (note: only applies to the current URL)
            boolean stopPaginating = false;
            boolean stopLinkFollowing = false;
            // (if set to stop paginating but only link following occurs, assume this is treated like pagination, eg nextUrl sort of thing)

            for (int nPage = 0; nPage < nMaxPages; ++nPage) {
                if ((dedupSet.size() >= maxDocsPerCycle) || stopPaginating) {
                    if (dedupSet.size() >= maxDocsPerCycle) {
                        src.setReachedMaxDocs();
                    }
                    break;
                }
                // Will use this to check if we reached a page limit (eg some sites will just repeat the same page over and over again)
                int nLinksFound = 0;
                int nCurrDedupSetSize = dedupSet.size();

                String url = savedUrl;

                // Apply the regex to the URL for pagination, part 2

                if ((null != pageChangeRegex)
                        && (null != feedConfig.getSearchConfig().getPageChangeReplace())) {
                    int nResultStart = nPage * feedConfig.getSearchConfig().getNumResultsPerPage()
                            + nResultOffset;
                    String replace = feedConfig.getSearchConfig().getPageChangeReplace().replace("$1",
                            Integer.toString(nResultStart));

                    if (null == pageChangeRegexMatcher) {
                        url += replace;
                    } else {
                        url = pageChangeRegexMatcher.replaceFirst(replace);
                    }
                } //TESTED

                //DEBUG
                //System.out.println("URL=" + url);

                // Create a custom UAH object to fetch and parse the search results

                UnstructuredAnalysisConfigPojo dummyUAHconfig = new UnstructuredAnalysisConfigPojo();
                if (null == feedConfig.getSearchConfig().getScriptflags()) { // Set flags if necessary
                    if (null == feedConfig.getSearchConfig().getExtraMeta()) {
                        feedConfig.getSearchConfig().setScriptflags("dt");
                    } else {
                        feedConfig.getSearchConfig().setScriptflags("dtm");
                    }
                }
                if (null != feedConfig.getSearchConfig().getExtraMeta()) {
                    dummyUAHconfig.CopyMeta(feedConfig.getSearchConfig().getExtraMeta());
                    // Legacy -> Pipeline port
                    for (metaField extraMeta : dummyUAHconfig.getMeta()) {
                        if (null == extraMeta.context) { // mandatory in legacy, discarded in pipeline!
                            extraMeta.context = Context.First;
                        }
                    }
                }
                dummyUAHconfig.setScript(feedConfig.getSearchConfig().getGlobals());
                dummyUAHconfig.AddMetaField("searchEngineSubsystem", Context.All,
                        feedConfig.getSearchConfig().getScript(), "javascript",
                        feedConfig.getSearchConfig().getScriptflags());
                src.setUnstructuredAnalysisConfig(dummyUAHconfig);
                if (null != searchConfig.getProxyOverride()) {
                    feedConfig.setProxyOverride(searchConfig.getProxyOverride());
                }
                if (null != searchConfig.getUserAgent()) {
                    feedConfig.setUserAgent(searchConfig.getUserAgent());
                }
                if (null != searchConfig.getHttpFields()) {
                    feedConfig.setHttpFields(searchConfig.getHttpFields());
                }
                if (null != searchConfig.getWaitTimeBetweenPages_ms()) {
                    // Web etiquette: don't hit the same site too often
                    // (applies this value to sleeps inside UAH.executeHarvest)
                    feedConfig.setWaitTimeOverride_ms(searchConfig.getWaitTimeBetweenPages_ms());
                }
                //TESTED (including RSS-level value being written back again and applied in SAH/UAH code)

                DocumentPojo searchDoc = docToSplit;
                Object[] savedMeta = null;
                if (null == searchDoc) {
                    searchDoc = new DocumentPojo();
                    // Required terms:
                    searchDoc.setUrl(url);
                    searchDoc.setScore((double) nIteratingDepth); // (spidering param)
                    // Handy terms
                    if (null != src.getHarvestStatus()) {
                        searchDoc.setModified(src.getHarvestStatus().getHarvested()); // the last time the source was harvested - can use to determine how far back to go
                    }
                    // If these exist (they won't normally), fill them:
                    searchDoc.setFullText(currFullText);
                    searchDoc.setDescription(currDesc);
                    searchDoc.setTitle(currTitle);
                } //TOTEST
                else if (null != searchDoc.getMetadata()) {
                    savedMeta = searchDoc.getMetadata().remove("searchEngineSubsystem");
                    // (this is normally null)
                } //TOTEST
                UnstructuredAnalysisHarvester dummyUAH = new UnstructuredAnalysisHarvester();
                boolean bMoreDocs = (nPage < nMaxPages - 1);
                Object[] searchResults = null;
                try {
                    if (null != searchConfig.getAuthExtractor()) {
                        src.setUseTextExtractor(searchConfig.getAuthExtractor());
                        src.setExtractorOptions(searchConfig.getAuthExtractorOptions());

                        LinkedHashMap<String, Object[]> savedAuthMeta = searchDoc.getMetadata();
                        try {
                            searchDoc.setMetadata(authenticationMeta);
                            HarvestController hc = (HarvestController) context;
                            ArrayList<DocumentPojo> docWrapper = new ArrayList<DocumentPojo>(1);
                            searchDoc.setTempSource(src);
                            docWrapper.add(searchDoc);
                            hc.extractTextAndEntities(docWrapper, src, false, true);
                            authenticationMeta = searchDoc.getMetadata();

                            if (null != authenticationMeta) {
                                if (null == feedConfig.getHttpFields()) {
                                    feedConfig.setHttpFields(new LinkedHashMap<String, String>());
                                }
                                for (Map.Entry<String, Object[]> kv : authenticationMeta.entrySet()) {
                                    if (1 == kv.getValue().length) {
                                        if (kv.getValue()[0] instanceof String) {
                                            feedConfig.getHttpFields().put(kv.getKey(),
                                                    kv.getValue()[0].toString());
                                        }
                                    }
                                }
                            }
                        } catch (Throwable t) {
                            //(do nothing)
                        } finally {
                            searchDoc.setMetadata(savedAuthMeta);

                            src.setUseTextExtractor(savedTextExtractor); // (will be null in pipeline cases - can cause odd results in non-pipeline cases, but is consistent with older behavior, which seems safest)
                            src.setExtractorOptions(savedExtractorOptions);
                        }
                    } //TESTED (if applying extractor options)
                    dummyUAH.executeHarvest(context, src, searchDoc, false, bMoreDocs);
                    // (the leading false means that we never sleep *before* the query, only after)
                    searchResults = searchDoc.getMetaData().get("searchEngineSubsystem");
                } finally {
                    if (null != savedMeta) { // (this is really obscure but handle the case where someone has created this meta field already) 
                        searchDoc.getMetadata().put("searchEngineSubsystem", savedMeta);
                    } else if ((null != searchDoc) && (null != searchDoc.getMetadata())) {
                        searchDoc.getMetadata().remove("searchEngineSubsystem");
                    }
                } //TOTEST

                //DEBUG
                //System.out.println("NEW DOC MD: " + new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(searchDoc.getMetadata()));

                // Create extraUrl entries from the metadata

                if ((null != searchResults) && (searchResults.length > 0)) {
                    for (Object searchResultObj : searchResults) {
                        try {
                            BasicDBObject bsonObj = (BasicDBObject) searchResultObj;

                            // 3 fields: url, title, description(=optional)
                            String linkUrl = bsonObj.getString(DocumentPojo.url_);

                            nLinksFound++;
                            if (!dedupSet.contains(linkUrl)) {
                                dedupSet.add(linkUrl);

                                String linkTitle = bsonObj.getString(DocumentPojo.title_);
                                String linkDesc = bsonObj.getString(DocumentPojo.description_);
                                String linkPubDate = bsonObj.getString(DocumentPojo.publishedDate_);
                                String linkFullText = bsonObj.getString(DocumentPojo.fullText_);
                                String spiderOut = bsonObj.getString("spiderOut");

                                if (null != linkUrl) {
                                    SourceRssConfigPojo.ExtraUrlPojo link = new SourceRssConfigPojo.ExtraUrlPojo();
                                    link.url = linkUrl;
                                    link.title = linkTitle;
                                    link.description = linkDesc;
                                    link.publishedDate = linkPubDate;
                                    link.fullText = linkFullText;
                                    if (!stopLinkFollowing && (null != itUrls) && (null != spiderOut)
                                            && spiderOut.equalsIgnoreCase("true")) {
                                        // In this case, add it back to the original list for chained processing

                                        if (null == waitingList) {
                                            waitingList = new LinkedList<ExtraUrlPojo>();
                                        }
                                        waitingList.add(link);
                                        // (can't result in an infinite loop like this because we check 
                                        //  dedupSet.size() and only allow links not already in dedupSet)

                                    } //TESTED

                                    if (null != linkTitle) {

                                        boolean isDuplicate = false;
                                        if (!stopPaginating && searchConfig.getStopPaginatingOnDuplicate()) {
                                            // Quick duplicate check (full one gets done later)
                                            isDuplicate = context.getDuplicateManager().isDuplicate_Url(linkUrl,
                                                    src, null);
                                        } //TESTED                                 
                                        if (!isDuplicate) {
                                            if (null == feedConfig.getExtraUrls()) {
                                                feedConfig.setExtraUrls(
                                                        new ArrayList<ExtraUrlPojo>(searchResults.length));
                                            }
                                            feedConfig.getExtraUrls().add(link);
                                        } else {
                                            stopPaginating = true;
                                            if (null == feedConfig.getSearchConfig().getPageChangeRegex()) {
                                                stopLinkFollowing = true;
                                            } //TESTED                                 
                                        } //TESTED
                                    }

                                }
                            } //(end if URL not already found)
                        } catch (Exception e) {
                            // (just carry on)
                            //DEBUG
                            //e.printStackTrace();
                        }
                    }
                } //TESTED
                else if (0 == nPage) { //returned no links; log an error if this is page 1 and an _ONERROR_ message has been saved
                    Object[] onError = searchDoc.getMetaData().get("_ONERROR_");
                    if ((null != onError) && (onError.length > 0) && (onError[0] instanceof String)
                            && !(((String) (onError[0]))).isEmpty()) {
                        throw new ExtractorSourceLevelTransientException(
                                "generateFeedFromSearch: _ONERROR_: " + onError[0]);
                    }
                } //TESTED

                if (context.isStandalone()) { // debug mode, will display some additional logging
                    Object[] onDebug = searchDoc.getMetaData().get("_ONDEBUG_");
                    if ((null != onDebug) && (onDebug.length > 0)) {
                        for (Object debug : onDebug) {
                            if (debug instanceof String) {
                                context.getHarvestStatus().logMessage("_ONDEBUG_: " + (String) debug, true);
                            } else {
                                context.getHarvestStatus().logMessage(
                                        "_ONDEBUG_: " + new com.google.gson.Gson().toJson(debug), true);
                            }
                        }
                    }
                } //TESTED

                // PAGINATION BREAK LOGIC:
                // 1: All the links are duplicates of links already in the DB
                // 2: No new links from last page

                // LOGIC CASE 1: (All the links are duplicates of links already in the DB)

                //(already handled above)

                // LOGIC CASE 2: (No new links from last page)

                //DEBUG
                //System.out.println("LINKS_SIZE=" + feedConfig.getExtraUrls().size());
                //System.out.println("LINKS=\n"+new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(feedConfig.getExtraUrls()));

                if (dedupSet.size() == nCurrDedupSetSize) { // All links were duplicate
                    //DEBUG
                    //System.out.println("FOUND " + nLinksFound + " vs " + nMinLinksToExitLoop + " duplicate URLs (" + nCurrDedupSetSize + ")");
                    if (nLinksFound >= nMinLinksToExitLoop) { // (at least 10 found so insta-quit)
                        break;
                    } else { // (fewer than 10 found)
                        nMinLinksToExitLoop = 0; // (also handles the no links found case)
                    }
                } //TESTED
                else {
                    nMinLinksToExitLoop = 10; // (reset)
                } //TESTED

            } // end loop over pages

        } catch (Exception e) {
            //DEBUG
            //e.printStackTrace();

            if ((null == dedupSet) || dedupSet.isEmpty()) {
                throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: " + e.getMessage());
            } else {
                throw new ExtractorDocumentLevelException("generateFeedFromSearch: " + e.getMessage());
            }
            // (don't log since these errors will appear in the log under the source, ie more usefully)
        } //TESTED
        finally {
            // Fix any temp changes we made to the source
            src.setUnstructuredAnalysisConfig(savedUAHconfig);
            feedConfig.setUserAgent(savedUserAgent);
            feedConfig.setHttpFields(savedHttpFields);
            feedConfig.setWaitTimeOverride_ms(savedWaitTimeOverride_ms);
            feedConfig.setProxyOverride(savedProxyOverride);

            src.setUseTextExtractor(savedTextExtractor);
            src.setUseExtractor(savedFeatureExtractor);
            src.setExtractorOptions(savedExtractorOptions);
        }
        if (null == itUrls) {
            break;
        } else if (!itUrls.hasNext()) {
            if (null != waitingList) {

                // Spider logic:
                if (null == searchConfig.getMaxDepth()) {
                    searchConfig.setMaxDepth(2); // (default max depth is 2 hops, ie original document, link, link from link)
                }
                nIteratingDepth++;
                if (nIteratingDepth > searchConfig.getMaxDepth()) {
                    break;
                }
                itUrls = waitingList.iterator();
                waitingList = null;
                // (end spider logic)

            } //TESTED
            else
                break;

        } //TESTED x2

    } //(end loop over candidate URLs)

}

From source file:com.ikanow.infinit.e.processing.generic.aggregation.AssociationBackgroundAggregationManager.java

License:Open Source License

private boolean getToken() {

    boolean bHaveControl = false;

    final String id_ = "_id";
    final String hostname_ = "hostname";
    final String oneUp_ = "1up";

    String savedHostname = "";
    String savedOneUp = "";

    while (!_bKillMe) {
        // Get IP address:
        BasicDBObject query = new BasicDBObject(id_, _assocLockId);
        BasicDBObject lockObj = (BasicDBObject) MongoDbManager.getFeature().getAggregationLock().findOne(query);
        if (null == lockObj) {
            lockObj = new BasicDBObject(id_, _assocLockId);
            lockObj.put(hostname_, getHostname());
            lockObj.put(oneUp_, Long.toString(1000000L * (new Date().getTime() % 10000)));
            // (ie a randomish start number)

            MongoDbManager.getFeature().getAggregationLock().insert(lockObj);
            // (will fail if another harvester gets there first)

            logger.debug("Creating a new aggregation lock object: " + lockObj.toString());

            lockObj = (BasicDBObject) MongoDbManager.getFeature().getAggregationLock().findOne();
        } //TESTED

        // So by here lockObj is always non-null

        // Do I have control?
        String hostname = lockObj.getString(hostname_);
        String oneUp = lockObj.getString(oneUp_);

        bHaveControl = getHostname().equals(hostname);

        if (!bHaveControl) { // Don't currently have control            
            if (savedHostname.equals(hostname) && savedOneUp.equals(oneUp)) { // Now I have control...
                logger.debug("I am taking control from: " + hostname + ", " + oneUp);
                bHaveControl = true;
                _nGrabbedControl++;
            } else if (getHostname().equals(savedHostname)) { // I had control of this last time I checked
                logger.debug("Lost control to: " + hostname);
                _nLostControl++;
            }
        } else {
            logger.debug("I have control already: " + hostname);
        } //TESTED

        if (bHaveControl) {
            savedHostname = hostname;
            long nOneUp = Long.parseLong(oneUp);
            lockObj.put(hostname_, getHostname());
            lockObj.put(oneUp_, Long.toString(nOneUp + 1));
            MongoDbManager.getFeature().getAggregationLock().save(lockObj);
            return true;
        } //TESTED
        else { // Save info and sleep for 60s 
            savedHostname = hostname;
            savedOneUp = oneUp;
            logger.debug("Saving state and sleeping: " + savedHostname + ", " + savedOneUp);

            for (int i = 0; (i < 6) && !_bKillMe; ++i) {
                try {
                    Thread.sleep(10000);
                } catch (InterruptedException e) {
                }
            }
        } //TESTED

    } // end loop forever 

    return bHaveControl;
}

From source file:com.images3.data.impl.MongoDBObjectMapper.java

License:Apache License

public PageCursor mapToPageCursor(BasicDBObject source) {
    Page page = new Page(source.getInt("start"), source.getInt("size"));
    return new PageCursor(source.getString("id"), source.getString("previousPageCursorId"), page,
            new Date(source.getLong("creationTime")));
}

From source file:com.images3.data.impl.MongoDBObjectMapper.java

License:Apache License

public ImagePlantOS mapToImagePlantOS(BasicDBObject source) {
    int maximumImageSize = MaximumImageSize.UNLIMITED;
    if (source.containsField("maximumImageSize")) {
        maximumImageSize = source.getInt("maximumImageSize");
    }
    return new ImagePlantOS(source.getString("id"), source.getString("name"),
            new Date(source.getLong("creationTime")), mapToAmazonS3Bucket((BasicDBObject) source.get("bucket")),
            source.getString("masterTemplateName"), source.getLong("numberOfTemplates"), maximumImageSize);
}

From source file:com.images3.data.impl.MongoDBObjectMapper.java

License:Apache License

public AmazonS3Bucket mapToAmazonS3Bucket(BasicDBObject source) {
    return new AmazonS3Bucket(source.getString("accessKey"), source.getString("secretKey"),
            source.getString("name"));
}

From source file:com.images3.data.impl.MongoDBObjectMapper.java

License:Apache License

public TemplateOS mapToTemplateOS(BasicDBObject source) {
    return new TemplateOS(new TemplateIdentity(source.getString("imagePlantId"), source.getString("name")),
            source.getBoolean("isArchived"), source.getBoolean("isRemovable"),
            mapToResizingConfig((BasicDBObject) source.get("resizingConfig")));
}

From source file:com.images3.data.impl.MongoDBObjectMapper.java

License:Apache License

public ResizingConfig mapToResizingConfig(BasicDBObject source) {
    return new ResizingConfig(ResizingUnit.valueOf(source.getString("unit")), source.getInt("width"),
            source.getInt("height"), source.getBoolean("isKeepProportions"));
}