Example usage for com.mongodb DBCursor count

List of usage examples for com.mongodb DBCursor count

Introduction

This page collects example usages of com.mongodb DBCursor count.

Prototype

public int count() 

Document

Counts the number of objects matching the query.
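
The following is a minimal, self-contained sketch (not taken from the sources below) showing the call in isolation with the legacy com.mongodb driver; the host, database, collection, and field names are illustrative assumptions. Note that count() asks the server for the total number of documents matching the cursor's query and ignores any skip() or limit() set on the cursor (size() is the variant that honours them):

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class DBCursorCountExample {
    public static void main(String[] args) throws Exception {
        MongoClient mongo = new MongoClient("localhost", 27017); // assumed host/port
        DB db = mongo.getDB("test");                             // assumed database name
        DBCollection coll = db.getCollection("docs");            // assumed collection name

        DBCursor dbc = coll.find(new BasicDBObject("status", "active")); // assumed query
        // count() runs a server-side count of everything matching the query;
        // it is unaffected by any skip()/limit() applied to this cursor
        int n = dbc.count();
        System.out.println("Matching documents: " + n);

        dbc.close();
        mongo.close();
    }
}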

Usage

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

@SuppressWarnings("unchecked")
public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return false;
        }
    } catch (Exception e) {
        //DEBUG
        //e.printStackTrace();

        return false; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    keyFields.put(SourcePojo.highestDistributionFactorStored_, 1);

    // Get and remove the sourceKey information, incorporate into source query,
    // so it's nice and simple by the time it gets to the actual query
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);

    if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless 
        if (null != sourceKeyQueryTerm) {
            keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
        }
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED (including $all to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db", "tagtest2"])
    else if (null != sourceKeyQueryTerm) {
        boolean sourceKeyQueryComplex = false;

        if (sourceKeyQueryTerm instanceof BasicDBObject) {
            BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm;
            if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be in/lt/lte/gt/gte ($in only on its own)
                for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) {
                    if (!sourceKeyQueryTermEl.equals(DbManager.in_)
                            && !sourceKeyQueryTermEl.equals(DbManager.lt_)
                            && !sourceKeyQueryTermEl.equals(DbManager.lte_)
                            && !sourceKeyQueryTermEl.equals(DbManager.gt_)
                            && !sourceKeyQueryTermEl.equals(DbManager.gte_)) {
                        sourceKeyQueryComplex = true;
                        break;
                    } //TESTED (eg ne)
                    else if (sourceKeyQueryTermEl.equals(DbManager.in_)
                            && (1 != sourceKeyQueryTermDbo.size())) {
                        sourceKeyQueryComplex = true;
                        break;
                    } //TESTED ((lt,in))
                }
            } //TESTED: (in, (gte,lt), ne)
            else {
                sourceKeyQueryComplex = true;
            } //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte":"test" } })
        } else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a regex
            sourceKeyQueryComplex = true;
        }
        //TESTED ($regex)

        if (sourceKeyQueryComplex) {
            keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below
        } else {
            return false; // already have a perfectly good source key specification
        }
    } //TESTED (See combinations above)

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
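    // (count() performs a server-side count of all docs matching keyQuery; it ignores any cursor skip()/limit())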
    int count = dbc.count();

    if (count > 5000) {
        // (too many source keys to process, just going to leave well alone... note this will mean $srctags will fail open)
        return false;
    } else {
        ArrayList<String> sources = new ArrayList<String>(count);
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
            sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor));
        }
        if (sources.isEmpty()) {
            throw new RuntimeException(); // will just return no splits at all, no problem
        } //TESTED
        if (1 == sources.size()) {
            query.put(DocumentPojo.sourceKey_, sources.get(0));
        } //TESTED
        else {
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
        } //TESTED

        return true;
    }
}

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources 
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
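                    // (sourceKeyList is non-null here: runningSources > 0 implies a key was added in the else branch below)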
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;

    } // (end if too many source keys across the communities)
}

From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityGeoCleanser.java

License:Open Source License

public boolean cleanseGeoInDocu(DocumentPojo doc) {

    boolean bChangedAnything = false;

    Map<String, Candidate> dubiousLocations = new HashMap<String, Candidate>();

    Set<String> otherRegions = new HashSet<String>();
    Set<String> otherCountries = new HashSet<String>();
    Set<String> otherCountriesOrRegionsReferenced = new HashSet<String>();

    //Debug
    if (_nDebugLevel >= 2) {
        System.out.println(
                "+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getEntities().size());
    }

    // 1] First off, let's find anything location-based and also determine if it's bad or not 

    if (null != doc.getEntities())
        for (EntityPojo ent : doc.getEntities()) {

            boolean bStrongCandidate = false;

            // Locations: check for dubious disambiguations
            if (EntityPojo.Dimension.Where == ent.getDimension()) {

                // So locations get disambiguated to one of:
                // "<city-etc>, <region-or-country>", or "<region-or-country>"
                // though can also just be left as they are.

                String sActualName = ent.getActual_name().toLowerCase();
                if (!ent.getDisambiguatedName().toLowerCase().equals(sActualName)) {
                    // It's been disambiguated

                    //Debug
                    if (_nDebugLevel >= 3) {
                        System.out.println("disambiguous candidate: " + ent.getDisambiguatedName() + " VS "
                                + ent.getActual_name() + " ("
                                + ((null != ent.getSemanticLinks()) ? ent.getSemanticLinks().size() : 0) + ")");
                    }

                    // OK next step, is it a disambiguation to a US town?
                    String splitMe[] = ent.getDisambiguatedName().split(", ");
                    if (2 == splitMe.length) {

                        String stateOrCountry = splitMe[1];
                        Matcher m = _statesRegex.matcher(stateOrCountry);
                        if (m.find()) { // This is a US disambiguation - high risk case
                            // Short cut if state is already directly mentioned?
                            stateOrCountry = stateOrCountry.toLowerCase();

                            if (!otherRegions.contains(stateOrCountry)) { // See list below - no need to go any further

                                // OK next step - is it a possible ambiguity:
                                ArrayList<BasicDBObject> x = new ArrayList<BasicDBObject>();
                                BasicDBObject inner0_0 = new BasicDBObject(MongoDbManager.not_,
                                        Pattern.compile("US"));
                                BasicDBObject inner1_0 = new BasicDBObject("country_code", inner0_0);
                                x.add(inner1_0);

                                BasicDBObject inner0_1 = new BasicDBObject(MongoDbManager.gte_, 400000);
                                BasicDBObject inner1_1 = new BasicDBObject("population", inner0_1);
                                x.add(inner1_1);

                                BasicDBObject dbo = new BasicDBObject();
                                dbo.append("search_field", sActualName);
                                dbo.append(MongoDbManager.or_, x);

                                DBCursor dbc = _georefDB.find(dbo);
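                                // (size() is count() taking skip()/limit() into account; equivalent here since neither is set)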
                                if (dbc.size() >= 1) { // Problems!

                                    //Create list of candidates

                                    Type listType = new TypeToken<LinkedList<GeoFeaturePojo>>() {
                                    }.getType();
                                    LinkedList<GeoFeaturePojo> grpl = new Gson()
                                            .fromJson(dbc.toArray().toString(), listType);

                                    //Debug
                                    if (_nDebugLevel >= 2) {
                                        System.out.println("\tERROR CANDIDATE: " + ent.getDisambiguatedName()
                                                + " VS " + ent.getActual_name() + " (" + dbc.count() + ")");

                                        if (_nDebugLevel >= 3) {
                                            for (GeoFeaturePojo grp : grpl) {
                                                System.out.println("\t\tCandidate:" + grp.getCity() + " / "
                                                        + grp.getRegion() + " / " + grp.getCountry());
                                            }
                                        }
                                    }

                                    Candidate candidate = new Candidate(ent, grpl, stateOrCountry);
                                    dubiousLocations.put(ent.getIndex(), candidate);
                                    bStrongCandidate = true;

                                } // if strong candidate
                            } //TESTED ("reston, virginia" after "virginia/stateorcounty" mention)
                              // (end if can't shortcut past all this)

                        } // end if a US town
                    } // end if in the format "A, B"

                } // if weak candidate
                  //TESTED

                if (!bStrongCandidate) { // Obviously can't count on an ambiguous candidate:
                    String type = ent.getType().toLowerCase();

                    if (type.equals("stateorcounty")) {
                        String disName = ent.getDisambiguatedName().toLowerCase();
                        if (_abbrStateRegex.matcher(disName).matches()) {
                            otherRegions.add(getStateFromAbbr(disName));
                        } else {
                            otherRegions.add(ent.getDisambiguatedName().toLowerCase());
                        }
                        otherCountriesOrRegionsReferenced.add("united states");
                    } //TESTED: "mich./stateorcounty"
                    else if (type.equals("country")) {
                        String disName = ent.getDisambiguatedName().toLowerCase();

                        // Translation of known badly transcribed countries:
                        // (England->UK)
                        if (disName.equals("england")) {
                            otherCountries.add("united kingdom");
                        } //TESTED
                        else {
                            otherCountries.add(ent.getDisambiguatedName().toLowerCase());
                        }
                    } else if (type.equals("region")) {
                        otherRegions.add(ent.getDisambiguatedName().toLowerCase());
                    } else if (type.equals("city")) {
                        String splitMe[] = ent.getDisambiguatedName().split(",\\s*");
                        if (2 == splitMe.length) {
                            otherCountriesOrRegionsReferenced.add(splitMe[1].toLowerCase());
                            if (this._statesRegex.matcher(splitMe[1]).find()) {
                                otherCountriesOrRegionsReferenced.add("united states");
                            } //TESTED: "lexingon, kentucky/city"
                        }
                    }
                } //TESTED: just above clauses

            } // if location

        } // (end loop over entities)

    // Debug:
    if ((_nDebugLevel >= 3) && (!dubiousLocations.isEmpty())) {
        for (String s : otherRegions) {
            System.out.println("Strong region: " + s);
        }
        for (String s : otherCountries) {
            System.out.println("Strong countries: " + s);
        }
        for (String s : otherCountriesOrRegionsReferenced) {
            System.out.println("Weak regionscountries: " + s);
        }
    }

    // 2] The requirements and algorithm are discussed in 
    // http://ikanow.jira.com/wiki/display/INF/Beta...+improving+AlchemyAPI+extraction+%28geo%29
    // Canonical cases:
    // Darfur -> Darfur, MN even though Sudan and sometimes Darfur, Sudan are present
    // Shanghai -> Shanghai, WV even though China is mentioned (and not WV)
    // Manchester -> Manchester village, NY (not Manchester, UK)
    // Philadelphia -> Philadelphia (village), NY (though NY is mentioned and not PA) 

    // We're generating the following order
    //       10] Sitting tenant with strong direct
    //       15] Large city with strong direct      
    //       20] Region with direct
    //       30] Large city with strong indirect
    //       40] Sitting tenant with strong indirect 
    //       50] Region with indirect
    //       60] Another foreign possibility with strong direct 
    //       70] Large city with weak direct
    //       72] Large city with weak indirect
    //       75] Large city with no reference 
    //       78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) 
    //       80] Sitting tenant with any weak (US) direct or indirect 
    //       90] Another foreign possibility with strong indirect 
    //      100] Another foreign possibility with weak direct 
    //      110] Another foreign possibility with weak indirect 
    //      120] Region with no reference, if there is only 1
    //      130] Sitting tenant with none of the above (ie default)
    //      140] Anything else!

    for (Map.Entry<String, Candidate> pair : dubiousLocations.entrySet()) {
        EntityPojo ent = pair.getValue().entity;
        Candidate candidate = pair.getValue();

        // 2.1] Let's analyse the "sitting tenant"

        int nPrio = 130;
        GeoFeaturePojo currLeader = null;
        int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other)

        if (otherRegions.contains(candidate.state)) { // Strong direct ref, winner!
            nPrio = 10; // winner!
        } //TESTED: "san antonio, texas/city" vs "texas"
        else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) {
            // Indirect ref
            nPrio = 40; // good, but beatable...
        } //TESTED: "philadelphia (village), new york/city" 
        else if (otherCountries.contains("united states")) { // Weak direct ref
            nPrio = 80; // better than nothing...            
        } //TESTED: "apache, oklahoma/city"
        else if (otherCountriesOrRegionsReferenced.contains("united states")) { // Weak indirect ref
            nPrio = 80; // better than nothing...            
        } //TESTED: "washington, d.c." have DC as stateorcounty, but US in countries list

        // Special case: we don't like "village":
        if ((80 != nPrio) && ent.getDisambiguatedName().contains("village")
                && !ent.getActual_name().contains("village")) {
            nPrio = 80;
        } //TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia"

        // Debug
        if (_nDebugLevel >= 2) {
            System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio);
        }

        // Alternatives
        if (nPrio > 10) {

            LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates;
            for (GeoFeaturePojo geo : geos) {

                int nAltPrio = 140;
                int nAltCase = -1;
                String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null;
                String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null;
                String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null;

                // 2.2] CASE 1: I'm a city with pop > 1M (best score 15)
                //                15] Large city with strong direct      
                //                30] Large city with strong indirect
                //                70] Large city with weak direct
                //                72] Large city with weak indirect
                //                75] Large city with no reference                

                if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) {
                    nAltCase = 1;

                    if ((null != region) && (otherRegions.contains(region))) {
                        nAltPrio = 15; // strong direct
                    } //TESTED: "dallas / Texas / United States = 15"
                    else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
                        nAltPrio = 30; // strong indirect
                    } //TESTED: "sacramento / California / United State"
                    else if ((null != country) && (otherCountries.contains(country))) {
                        nAltPrio = 70; // weak direct 
                    } //TESTED: "berlin, germany", with "germany" directly mentioned
                    else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                        nAltPrio = 72; // weak indirect 
                    } //TESTED: "los angeles / California / United States = 72"
                    else {
                        nAltPrio = 75; // just for being big!
                    } //TESTED: "barcelona, spain"
                }

                // 2.3] CASE 2: I'm a region (best score=20, can beat current score)
                //                20] Region with direct
                //                50] Region with indirect
                //               120] Region with no reference, if there is only 1

                else if ((null == city) && (nPrio > 20)) {
                    nAltCase = 2;

                    if ((null != country) && (otherCountries.contains(country))) {
                        nAltPrio = 20; // strong direct 
                    } //TESTED: (region) "Berlin, Germany" with "Germany" mentioned
                    else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                        nAltPrio = 50; // strong indirect 
                    } //(haven't seen, but we'll live)
                    else {
                        nAltPrio = 120; // (just for being there)
                    } //TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China"
                }

                // 2.4] CASE 3: I'm any foreign possibility (best score=60)
                //                60] Another foreign possibility with strong direct 
                //                78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) 
                //                90] Another foreign possibility with strong indirect 
                //               100] Another foreign possibility with weak direct 
                //               110] Another foreign possibility with weak indirect 

                else if (nPrio > 60) {
                    nAltCase = 3;

                    if ((null != region) && (otherRegions.contains(region))) {
                        nAltPrio = 60; // strong direct

                        // Double check we're not falling into the trap below:
                        if (!geo.getCountry_code().equals("US")) {
                            Matcher m = this._statesRegex.matcher(geo.getRegion());
                            if (m.matches()) { // non US state matching against (probably) US state, disregard)
                                nAltPrio = 140;
                            }
                        } //TESTED (same clause as below)

                    } //TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause)

                    if (nAltPrio > 60) { // (may need to re-run test)
                        if ((null != country) && (otherCountries.contains(country))) {
                            if (geo.getPopulation() < 100000) {
                                nAltPrio = 90; // strong indirect
                            } //TESTED: "washington / Villa Clara / Cuba"
                            else {
                                nAltPrio = 78; // strong indirect, with boost!                        
                            } //TESTED: "geneva, Geneve, Switzerland", pop 180K
                        } else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
                            nAltPrio = 100; // weak direct
                        } //TESTED: "lincoln / Lincolnshire / United Kingdom = 100"
                        else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                            nAltPrio = 110; // weak indirect
                        } //(haven't seen, but we'll live)                  
                    }
                }
                // Debug:
                if ((_nDebugLevel >= 2) && (nAltPrio < 140)) {
                    System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / "
                            + geo.getCountry() + " score=" + nAltPrio);
                }

                // Outcome of results:

                if (nAltPrio < nPrio) {
                    currLeader = geo;
                    nPrio = nAltPrio;
                    nCase = nAltCase;
                }
            } // end loop over alternatives

            if (null != currLeader) { // Need to change

                if (1 == nCase) {
                    this._nMovedToLargeCity++;

                    //(Cities are lower case in georef DB for some reason)
                    String city = WordUtils.capitalize(currLeader.getCity());

                    if (currLeader.getCountry_code().equals("US")) { // Special case: is this just the original?

                        String region = currLeader.getRegion();
                        if (region.equals("District of Columbia")) { // Special special case
                            region = "D.C.";
                        }
                        String sCandidate = city + ", " + region;

                        if (!sCandidate.equals(ent.getDisambiguatedName())) {
                            ent.setDisambiguatedName(sCandidate);
                            ent.setIndex(ent.getDisambiguatedName() + "/city");
                            ent.setSemanticLinks(null);
                            bChangedAnything = true;
                        } //TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. -> Wash DC)
                        else {
                            this._nMovedToLargeCity--;
                            _nStayedWithOriginal++;
                        } //TESTED ("Washington DC", "San Juan, Puerto Rico")
                    } //TESTED (see above)
                    else {
                        ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
                        ent.setIndex(ent.getDisambiguatedName() + "/city");
                        ent.setSemanticLinks(null);
                        bChangedAnything = true;
                    } //TESTED: "london, california/city to London, United Kingdom"
                } else if (2 == nCase) {
                    this._nMovedToRegion++;
                    ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry());
                    ent.setIndex(ent.getDisambiguatedName() + "/region");
                    ent.setSemanticLinks(null);
                    bChangedAnything = true;

                } //TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above)
                else {
                    //(Cities are lower case in georef DB for some reason)
                    String city = WordUtils.capitalize(currLeader.getCity());

                    this._nMovedToForeignCity++;
                    ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
                    ent.setIndex(ent.getDisambiguatedName() + "/city");
                    ent.setSemanticLinks(null);
                    bChangedAnything = true;

                } //TESTED: "Moved geneva, new york/city to Geneva, Switzerland"

                if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) {
                    System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName());
                }
            } else {
                _nStayedWithOriginal++;
            }

        } // (if sitting tenant not holder)

    } // (end loop over candidates)      

    if ((_nDebugLevel >= 1) && bChangedAnything) {
        System.out.println("\t(((Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getUrl() + ")))");
    }

    return bChangedAnything;
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java

License:Open Source License

public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {

    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_,
                                new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED

                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                } //TESTED (found docs)

                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //   System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));

            } //(success mode)            
        } catch (Exception e) {
        } // If anything goes wrong, we'll just check all files (slower)

    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...         
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;

        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED

    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No short cut, go the long way round:

    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();

    if (!foundMatch) { //if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);

        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)               
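                // (count() ignores the limit(2) set on dbc above, so nCount is the total number of matching docs)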
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)         
        } else { // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}

From source file:com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java

License:Open Source License

@Override
public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) {
    if (_isDirectory) {
        if (_isShare) { // must be a zip file
            ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>();
            @SuppressWarnings("unchecked")
            Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries();
            while (entries.hasMoreElements()) {
                net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName());
                zipFiles.add(newFile);
            }
            return zipFiles.toArray(new InfiniteFile[zipFiles.size()]);
        } //TESTED (3.2)
        else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory"
            String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_);
            String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_);
            if (null == outputDatabase) {
                outputDatabase = "custommr";
            }
            DBCollection outColl = null;
            DBCursor dbc = null;
            if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory

                DBCollection chunks = MongoDbManager.getCollection("config", "chunks");
                StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection);
                dbc = chunks.find(new BasicDBObject("ns", ns.toString()));
                int splits = dbc.count();

                if (splits < 2) { // Nothing to do (unsharded or 1 chunk)
                    dbc.close();

                    outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                    dbc = outColl.find();
                } //TESTED (4.2)
                else { // Create one virtual dir per split
                    InfiniteFile[] virtualDirs = new InfiniteFile[splits];
                    int added = 0;
                    for (DBObject splitObj : dbc) {
                        BasicDBObject minObj = (BasicDBObject) splitObj.get("min");
                        BasicDBObject maxObj = (BasicDBObject) splitObj.get("max");
                        ObjectId minId = null;
                        try {
                            minId = (ObjectId) minObj.get("_id");
                        } catch (Exception e) {
                        } // min key..
                        ObjectId maxId = null;
                        try {
                            maxId = (ObjectId) maxObj.get("_id");
                        } catch (Exception e) {
                        } // max key..

                        //Handle current case where custom jobs are all dumped in with the wrong _id type                     
                        if ((null != minId) || (null != maxId)) {
                            if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below)

                                if (maxId.getTime() < optionalFilterDate.getTime()) {
                                    // (the "getTime()"s can overlap across chunks so we have to use minId
                                    //  and accept that we'll often deserialize 1+ extra chunk every harvest)
                                    continue;
                                }
                            } //TESTED (by hand)

                            InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId);
                            virtualDirs[added] = split;
                            added++;
                        } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand)
                    }
                    dbc.close();
                    return virtualDirs;
                } //TESTED (5.2.2, 6.2.2)
            } //TESTED
            else { // Virtual directory
                BasicDBObject query = new BasicDBObject();
                if (null != _virtualDirStartLimit) {
                    if (null != optionalFilterDate) {
                        ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                        //(zero out the inc/machine ids so this query is independent to calling service)

                        if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit)
                            query.put(MongoDbManager.gte_, altStartId);
                        } else {
                            query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                        }
                    } //TESTED (by hand)
                    else { // normal case
                        query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                    }
                } else if (null != optionalFilterDate) { // (first chunk so always overwrite with optionalFilter date if applicable)
                    ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                    query.put(MongoDbManager.gte_, altStartId);
                } //TESTED (by hand)
                if (null != _virtualDirEndLimit) {
                    query.put(MongoDbManager.lt_, _virtualDirEndLimit);
                }

                outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle);
            } //TESTED (6.2.2) (doc skipping by hand)

            if (null != outColl) { // has files, create the actual file objects
                //DEBUG
                //System.out.println("CHUNK: GOT " + dbc.count());

                int docCount = dbc.count();
                if (docCount > 1 + maxDocsPerCycle) {
                    docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway)
                }
                InfiniteFile[] docs = new InfiniteFile[docCount];
                int added = 0;
                for (DBObject docObj : dbc) {
                    // (if didn't use a query then apply internal filter date by hand)
                    if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)
                            && (null != optionalFilterDate)) {
                        ObjectId docId = (ObjectId) docObj.get("_id");
                        if (optionalFilterDate.getTime() > docId.getTime()) {
                            continue;
                        }
                    } //TESTED

                    if (added >= maxDocsPerCycle) { // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to)
                        docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"),
                                _virtualDirEndLimit);
                        break;
                    } else {
                        InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj);
                        docs[added] = doc;
                    } //TESTED (both cases)
                    added++;
                }
                dbc.close();
                return docs;

            } //TESTED (4.2)
        }
    } else { // can just return myself
        InfiniteFile[] retVal = new InfiniteFile[1];
        retVal[0] = this;
        return retVal;
    } //TESTED (1.2, 2.2)
    return null;
}

From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 

    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex("association_index");
    //elasticManager.deleteMe();

    // Create the index if necessary
    String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
            AssociationFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping,
            localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex("association_index");
    }

    // Now query the DB:

    DBCursor dbc = null;
    dbc = eventFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
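        // (count() ignores the skip()/limit() applied above, hence subtracting nSkip to get what remains)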
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();

    int nSynced = 0;

    // Loop over array and invoke the cleansing function for each one
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

        // If this table has just been rebuilt from the document then the indexes are all wrong ...
        // recalculate and save
        if ('#' == evt.getIndex().charAt(0)) {
            AssociationPojo singleEvt = new AssociationPojo();
            singleEvt.setEntity1_index(evt.getEntity1_index());
            singleEvt.setEntity2_index(evt.getEntity2_index());
            singleEvt.setVerb_category(evt.getVerb_category());
            singleEvt.setGeo_index(evt.getGeo_index());
            evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt));
            eventFeatureDB
                    .update(new BasicDBObject("_id", dbo.get("_id")),
                            new BasicDBObject(MongoDbManager.set_,
                                    new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                            false, true);
            // (has to be a multi-update even though it's unique because it's sharded on index)
        }

        // Handle groups (system group is: "4c927585d591d31d7b37097a")
        if (null == evt.getCommunityId()) {
            evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
        }
        // Bulk add prep
        events.add(evt);
        nSynced++;

        if (events.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events,
                    AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null,
                    true);
            events.clear();
        }
    }
    // End loop over entities

    // write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
            new AssociationFeaturePojoIndexMap()), "_id", null, true);

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}

From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java

License:Apache License

private void doDelete(BasicDBObject query, int nLimit) {
    try {
        // Initialize the DB:   
        DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

        DBCursor cur = eventFeatureDB.find(query).limit(nLimit);
        // (this internally works in batches of 1000; just get _id)
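        // (count() ignores the limit() above, so this prints the total number of matches)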
        System.out.println("Found " + cur.count() + " records to delete");
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }

        ArrayList<AssociationFeaturePojo> events = new ArrayList<AssociationFeaturePojo>();
        LinkedList<String> eventIds = new LinkedList<String>();
        while (cur.hasNext()) {
            AssociationFeaturePojo event = AssociationFeaturePojo.fromDb(cur.next(),
                    AssociationFeaturePojo.class);
            events.add(event);
            eventIds.add(
                    new StringBuffer(event.getIndex()).append(":").append(event.getCommunityId()).toString());
            eventFeatureDB.remove(new BasicDBObject("index", event.getIndex()));
        }
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("association_index");
        elasticManager.bulkDeleteDocuments(eventIds);

    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (MongoException e) {
        e.printStackTrace();
    }
}

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:

    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();

    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)

    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString()
                .contains(new StringBuffer().append('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException(
                    "Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
            query.put(DocumentPojo.sourceKey_,
                    SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                sourceKeys.addAll(sourceKeysForSource);
            }
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)         
    } else {
        if (query.toString()
                .contains(new StringBuffer().append('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key,
                            distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                    added += sourceKeysForSource.size();
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();

                System.out.println("(Can't optimize complex community query: " + e.getMessage());
            }
        } //TESTED (by hand - including distributed source version)
    }
    // Ignored delete objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
      //DEBUG
      //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {
        EntityBackgroundAggregationManager.startThread();
        AssociationBackgroundAggregationManager.startThread();
    }

    //Debug:
    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
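        // (as above, count() ignores the cursor's skip()/limit() modifiers, so subtract nSkip)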
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    byte[] storageArray = new byte[200000];

    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        }
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            _deletedIndex.add(sDocIndex);
            rebuildIndex(sDocIndex);
            try { // (Just in case the index requires some time to sort itself out)
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
        }

        //Debug:
        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);

            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    // (check the content cursor, not the outer document cursor)
                    if (dbcGzip.hasNext()) {
                        continue;
                    } //TESTED (by hand)
                }

                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                gzip.close();
                doc.setFullText(output.toString());
            }
        }
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                    .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);

                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                        //DEBUG
                        e.printStackTrace();
                    }
                } //TESTED (by hand)

                _sourceCache.put(doc.getSourceKey(), src);
            }
        }
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }

                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                doc.getRawSourceKey()); // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                        docsDB.update(updateQuery,
                                new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_,
                                        new BasicDBObject(DbManager.each_, tagsTidied))));
                    }
                    doc.setTags(tagsTidied); // (just copy ptr across)
                }
            }
        }

        // 2. Update the index with the new document            

        // (Optionally also update entity and assoc features)

        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                perCommunityDocList.add(doc);
            }
        } //TESTED

        nSynced++;
        docsToTransfer.add(doc);
        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();

            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs               
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;

            } //TOTEST            

            manager.addToSearch(docsToTransfer);
            docsToTransfer.clear();
            System.out.println("(Synced " + nSynced + " records)");
        }

    } // (End loop over docs)

    // Sync remaining docs

    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs               
            doAggregation(communityList, docsToTransfer);
        }

        StoreAndIndexManager manager = new StoreAndIndexManager();
        manager.addToSearch(docsToTransfer);
    }

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }

    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
        System.out.println(
                "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
            Thread.sleep(300000);
        } catch (InterruptedException e) {
        }

        // Turn off so we can exit
        EntityBackgroundAggregationManager.stopThreadAndWait();
        AssociationBackgroundAggregationManager.stopThreadAndWait();
    }
}
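
A minimal, self-contained sketch of the skip-adjusted count() pattern used above. The host, database, collection name, and query here are placeholders, not taken from the original source:

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class SkipAdjustedCount {
    public static void main(String[] args) throws Exception {
        MongoClient mongo = new MongoClient("localhost", 27017); // placeholder host/port
        DB db = mongo.getDB("testdb"); // placeholder database
        DBCollection coll = db.getCollection("docs"); // placeholder collection

        int nSkip = 100;
        DBCursor dbc = coll.find(new BasicDBObject()).skip(nSkip);

        // DBCursor.count() counts all objects matching the query and ignores
        // skip()/limit(), so subtract the skip manually and clamp at zero
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println("Found " + nCount + " records to process");

        mongo.close();
    }
}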

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

private void doDelete(BasicDBObject query, int nLimit) {
    try {
        // Get the documents to delete
        BasicDBObject queryFields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
        queryFields.put(DocumentPojo.sourceUrl_, 1);
        queryFields.put(DocumentPojo.url_, 1);
        queryFields.put(DocumentPojo.communityId_, 1);
        queryFields.put(DocumentPojo.index_, 1);

        DBCursor cur = DbManager.getDocument().getMetadata().find(query, queryFields).limit(nLimit);
        // (this internally works in batches of 1000)         
        System.out.println("Found " + cur.count() + " records to delete");
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }

        List<DocumentPojo> docs = DocumentPojo.listFromDb(cur, DocumentPojo.listType());

        // Keep track of number of docs per community getting deleted
        Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
        Map<String, Integer> sourceKeyMap = new HashMap<String, Integer>();
        for (DocumentPojo doc : docs) {
            if (null != doc.getSourceKey()) { // (can only happen by error, still)
                ObjectId community = doc.getCommunityId();
                Integer count = communityMap.get(community);
                communityMap.put(community, (count == null ? 1 : count + 1));
                String sourceKey = doc.getSourceKey();
                Integer count2 = sourceKeyMap.get(sourceKey);
                sourceKeyMap.put(sourceKey, (count2 == null ? 1 : count2 + 1));
            }
        }
        StoreAndIndexManager dataStore = new StoreAndIndexManager();
        dataStore.removeFromDatastore_byURL(docs, null);
        AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
        dataStore.removeSoftDeletedDocuments();
        AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());

        // Actually update the DB counts:
        for (Map.Entry<ObjectId, Integer> communityInfo : communityMap.entrySet()) {
            System.out.println("Removed " + communityInfo.getValue() + " records from community "
                    + communityInfo.getKey());
            DbManager.getDocument().getCounts().update(new BasicDBObject("_id", communityInfo.getKey()),
                    new BasicDBObject("$inc", new BasicDBObject("doccount", -communityInfo.getValue())));
        }
        for (Map.Entry<String, Integer> sourceInfo : sourceKeyMap.entrySet()) {
            System.out.println(
                    "Removed " + sourceInfo.getValue() + " records from source " + sourceInfo.getKey());
            DbManager.getIngest().getSource().update(new BasicDBObject("key", sourceInfo.getKey()),
                    new BasicDBObject("$inc", new BasicDBObject("harvest.doccount", -sourceInfo.getValue())));
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
}
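
A side note on the snippet above: DBCursor.count() reports the total number of matches for the query and ignores limit(), which is why the code prints the limit separately. A minimal sketch of the difference between count() and size(), with placeholder names and query:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class CountVersusSize {
    public static void main(String[] args) throws Exception {
        MongoClient mongo = new MongoClient("localhost", 27017); // placeholder host/port
        DBCollection coll = mongo.getDB("testdb").getCollection("docs"); // placeholders

        int nLimit = 10;
        DBCursor cur = coll.find(new BasicDBObject("status", "live")).limit(nLimit); // placeholder query

        System.out.println("Total matches (count() ignores limit): " + cur.count());
        System.out.println("Actually returned (size() honors skip/limit): " + cur.size());

        mongo.close();
    }
}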

From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 

    String indexName = "entity_index";
    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex(indexName);
    //elasticManager.deleteMe();

    // Create the index if necessary
    String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
            EntityFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex(indexName);
    }

    // Now query the DB:

    DBCursor dbc = null;
    dbc = entityFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    int nSynced = 0;

    List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
    while (dbc.hasNext()) {
        EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

        if (null != feature.getAlias()) { // (some corrupt gazetteer entry)

            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            // if there is no community id, add system group (something is wrong if this happens?)
            if (null == feature.getCommunityId()) {
                feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
            }
        }

        entities.add(feature);
        nSynced++;

        // Add the entities
        if (entities.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                    new EntityFeaturePojoIndexMap()), "_id", null, true);
            // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

            entities = new ArrayList<EntityFeaturePojo>();
        }
    }
    // Write whatever's left (guard against an empty final batch)
    if (!entities.isEmpty()) {
        elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                new EntityFeaturePojoIndexMap()), "_id", null, true);
        // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)
    }

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}
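
A short sketch of the progress-reporting idiom both transfer methods above use: take count() once up front, then iterate with a batched cursor. All names are placeholders:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;

public class BatchedSyncWithProgress {
    public static void main(String[] args) throws Exception {
        MongoClient mongo = new MongoClient("localhost", 27017); // placeholder host/port
        DBCollection coll = mongo.getDB("testdb").getCollection("features"); // placeholders

        DBCursor dbc = coll.find(new BasicDBObject()).batchSize(1000);
        int nTotal = dbc.count(); // total matches - batchSize() affects fetching only, not count()

        int nSynced = 0;
        while (dbc.hasNext()) {
            DBObject dbo = dbc.next();
            // ... transform and buffer dbo for a bulk write here ...
            nSynced++;
            if (0 == (nSynced % 1000)) {
                System.out.println("(Synced " + nSynced + " of " + nTotal + " records)");
            }
        }
        System.out.println("(Done: " + nSynced + " of " + nTotal + " records)");

        mongo.close();
    }
}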