Example usage for com.mongodb BasicDBObject append

List of usage examples for com.mongodb BasicDBObject append

Introduction

On this page you can find example usages of com.mongodb BasicDBObject append.

Prototype

@Override
public BasicDBObject append(final String key, final Object val) 

Source Link

Document

Add a key/value pair to this object

Usage

From source file:com.ikanow.infinit.e.data_model.store.MongoDbUtil.java

License:Apache License

/**
 * Converts a {@code JsonObject} into a {@code BasicDBObject}.
 * Every member is copied under its original key, with its value translated by
 * {@code encodeUnknown}.
 *
 * @param o the JSON object to convert
 * @return a new {@code BasicDBObject} with one entry per member of {@code o}
 */
public static BasicDBObject encode(JsonObject o) {
    final BasicDBObject encoded = new BasicDBObject();
    for (Map.Entry<String, JsonElement> member : o.entrySet()) {
        final String key = member.getKey();
        final JsonElement rawValue = member.getValue();
        encoded.append(key, encodeUnknown(rawValue));
    }
    return encoded;
}

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java

License:Open Source License

/**
 * getQuery/* ww  w  .  j a v a  2 s  .  c  o  m*/
 * @param g
 * @param hasGeoindex
 * @param attempt
 * @return
 */
//TODO (INF-1864): running this in non-strict mode can cripple the DB since search field might not
//be set ... at least need to cache such queries (almost always the US every time!)....

private static BasicDBObject getQuery(Boolean hasGeoindex, int attempt) {
    BasicDBObject query = new BasicDBObject();

    // SearchField
    String searchField = (geoInfo.getSearch_field() != null) ? geoInfo.getSearch_field().toLowerCase() : null;

    // Cities are all lower case in the georeference collection, set toLowerCase here
    String city = (geoInfo.getCity() != null) ? geoInfo.getCity().toLowerCase() : null;

    // Use WordUtils.capitalize to set first char of region and country words to Upper Case
    String region = (geoInfo.getRegion() != null) ? WordUtils.capitalize(geoInfo.getRegion()) : null;
    String country = (geoInfo.getCountry() != null) ? WordUtils.capitalize(geoInfo.getCountry()) : null;
    String countryCode = geoInfo.getCountry_code();

    // If the only field sent was the search_field
    if ((searchField != null) && (city == null) && (region == null) && (country == null)
            && (countryCode == null)) {
        query.put("search_field", searchField);
    }

    // Otherwise...
    else {
        switch (attempt) {
        case 1:
            // Set the searchField if it is null
            if (searchField == null && city != null)
                searchField = city.toLowerCase();
            if (searchField == null && region != null)
                searchField = region.toLowerCase();
            if (searchField == null && country != null)
                searchField = country.toLowerCase();

            // 
            if (searchField != null)
                query.put("search_field", searchField);
            if (city != null)
                query.put("city", city);
            if (region != null)
                query.put("region", region);
            if (country != null)
                query.put("country", country);
            if (null == searchField) { // only country code specified...
                query.put("city", new BasicDBObject(DbManager.exists_, false));
                query.put("region", new BasicDBObject(DbManager.exists_, false));
            }
            if (countryCode != null)
                query.put("country_code", countryCode);
            break;

        case 2:
            if (city != null) {
                query.put("search_field", city.toLowerCase());
                query.put("city", city);
            } else if (region != null) {
                query.put("search_field", region.toLowerCase());
                query.put("region", region);
            } else {
                query.put("search_field", country.toLowerCase());
            }

            if (country != null)
                query.put("country", country);
            if (countryCode != null)
                query.put("country_code", countryCode);
            break;

        case 3:
            if (searchField == null && region != null)
                searchField = region.toLowerCase();
            if (searchField == null && country != null)
                searchField = country.toLowerCase();

            if (searchField != null)
                query.put("search_field", searchField);
            if (region != null)
                query.put("region", region);
            if (country != null)
                query.put("country", country);
            if (countryCode != null)
                query.put("country_code", countryCode);
            break;

        default:
            if (country != null)
                query.put("search_field", country.toLowerCase());
            if (country != null)
                query.put("country", country);
            if (countryCode != null)
                query.put("country_code", countryCode);
            break;
        }
    }
    if (query.isEmpty()) {
        return null;
    }

    // Only return records with GeoIndex objects
    if (hasGeoindex) {
        BasicDBObject ne = new BasicDBObject();
        ne.append(DbManager.exists_, true);
        query.put("geoindex", ne);
    }

    return query;
}

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java

License:Open Source License

/**
 * getNearestCities/*  w w w. j  a  va2 s  . c o  m*/
 * Get n-cities near a lat/lon pair, results returned ordered by distance from
 * the lat/lon pair
 * @param lat
 * @param lon
 * @param nMaxReturns
 * @return List<GeoReferencePojo>
 */
public static List<GeoFeaturePojo> getNearestCities(DBCollection geoDb, String lat, String lon,
        int nMaxReturns) {
    try {
        // Create Double[] from lat, lon
        Double[] d = new Double[] { Double.parseDouble(lat), Double.parseDouble(lon) };

        // Build query object to return the shell equivalent of:
        // db.georeference.find({geoindex : {$near : [lat.lon]}})
        BasicDBObject query = new BasicDBObject();
        BasicDBObject near = new BasicDBObject();
        near.append("$near", d);
        query.put("geoindex", near);

        // Perform query
        DBCursor result = geoDb.find(query).limit(nMaxReturns);

        // Convert results to List<GeoReferencePojo>
        List<GeoFeaturePojo> gpl = GeoFeaturePojo.listFromDb(result,
                new TypeToken<ArrayList<GeoFeaturePojo>>() {
                });
        return gpl;
    } catch (Exception e) {
        return null;
    }
}

From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityGeoCleanser.java

License:Open Source License

/**
 * Loads documents from {@code _docsDB}, runs {@link #cleanseGeoInDocu} on each,
 * and optionally writes the modified entity lists back.
 * Updates the {@code _nDocusProcessed} and {@code _nDocusModified} counters.
 *
 * @param nSkip     number of matching documents to skip
 * @param userQuery query selecting the documents to process; {@code null} means
 *                  an empty query
 * @param nLimit    maximum number of documents to fetch; values &lt;= 0 apply no limit
 * @param bAlterDB  if true, the entity list of each modified document is
 *                  overwritten in the database
 */
public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB)
        throws NumberFormatException, UnknownHostException, MongoException {

    // Initialization (regexes and stuff)
    this.initialize();

    // Launch MongoDB query

    BasicDBObject query = userQuery;
    if (null == query) {
        // Fix: the original created the empty query but never assigned it,
        // so a null query was passed on to find().
        query = new BasicDBObject();
    }

    // Just get the entity list out to save a few CPU cycles
    BasicDBObject outFields = new BasicDBObject();
    outFields.append(DocumentPojo.entities_, 1);
    outFields.append(DocumentPojo.url_, 1); // (help with debugging)
    outFields.append(DocumentPojo.title_, 1); // (help with debugging)

    DBCursor dbc = null;
    if (nLimit > 0) {
        dbc = _docsDB.find(query, outFields).limit(nLimit).skip(nSkip);
    } else { // Everything!
        dbc = _docsDB.find(query, outFields).skip(nSkip);
    }

    // Create POJO array of documents (definitely not the most efficient, but
    // will make integration with the harvester easier)

    List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());

    // Loop over array and invoke the cleansing function for each one

    for (DocumentPojo docu : docus) {
        if (this.cleanseGeoInDocu(docu)) {
            this._nDocusModified++;

            if (bAlterDB) {

                BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_,
                        (DBObject) com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities())));
                BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0);

                // Overwrite the existing entities list with the new one
                _docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true);
                // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it)

            } //TESTED
        }
        this._nDocusProcessed++;
    }
}

From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityGeoCleanser.java

License:Open Source License

/**
 * Re-checks the disambiguation of the location ("Where") entities of a document.
 * <p>
 * Step 1 walks the entities. An entity whose disambiguated name has the form
 * "A, B" with B matching {@code _statesRegex} is looked up in {@code _georefDB}
 * by its actual name; if alternative records exist (non-US, or population
 * &gt;= 400000) it becomes a "dubious" candidate. All other location entities
 * are used as evidence: the regions and countries they mention directly or
 * indirectly are collected.
 * <p>
 * Step 2 scores the current disambiguation (the "sitting tenant") and every
 * alternative against that evidence (lower score wins). If an alternative wins,
 * the entity's disambiguated name and index are rewritten and its semantic links
 * are cleared.
 * <p>
 * Side effects: updates the {@code _nMovedToLargeCity}, {@code _nMovedToRegion},
 * {@code _nMovedToForeignCity} and {@code _nStayedWithOriginal} counters, and
 * prints diagnostics to stdout depending on {@code _nDebugLevel}.
 *
 * @param doc the document whose entities are checked (entities are modified in place)
 * @return true if at least one entity was changed
 */
public boolean cleanseGeoInDocu(DocumentPojo doc) {

    boolean bChangedAnything = false;

    Map<String, Candidate> dubiousLocations = new HashMap<String, Candidate>();

    Set<String> otherRegions = new HashSet<String>();
    Set<String> otherCountries = new HashSet<String>();
    Set<String> otherCountriesOrRegionsReferenced = new HashSet<String>();

    //Debug
    if (_nDebugLevel >= 2) {
        // Fix: the entity list may be null (see the check below), so don't
        // dereference it unguarded just to print its size.
        System.out.println("+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / "
                + ((null != doc.getEntities()) ? doc.getEntities().size() : 0));
    }

    // 1] First off, let's find anything location-based and also determine if it's bad or not 

    if (null != doc.getEntities())
        for (EntityPojo ent : doc.getEntities()) {

            boolean bStrongCandidate = false;

            // Locations only
            if (EntityPojo.Dimension.Where == ent.getDimension()) {

                // So locations get disambiguated to one of:
                // "<city-etc>, <region-or-country>", or "<region-or-country>"
                // though can also just be left as they are.

                String sActualName = ent.getActual_name().toLowerCase();
                if (!ent.getDisambiguatedName().toLowerCase().equals(sActualName)) {
                    // It's been disambiguated

                    //Debug
                    if (_nDebugLevel >= 3) {
                        System.out.println("disambiguous candidate: " + ent.getDisambiguatedName() + " VS "
                                + ent.getActual_name() + " ("
                                + ((null != ent.getSemanticLinks()) ? ent.getSemanticLinks().size() : 0) + ")");
                    }

                    // OK next step, is it a disambiguation to a US town?
                    String splitMe[] = ent.getDisambiguatedName().split(", ");
                    if (2 == splitMe.length) {

                        String stateOrCountry = splitMe[1];
                        Matcher m = _statesRegex.matcher(stateOrCountry);
                        if (m.find()) { // This is a US disambiguation - high risk case
                            // Short cut if state is already directly mentioned?
                            stateOrCountry = stateOrCountry.toLowerCase();

                            if (!otherRegions.contains(stateOrCountry)) { // See list below - no need to go any further

                                // OK next step - is it a possible ambiguity:
                                // (same name AND (country code not matching "US" OR population >= 400000))
                                ArrayList<BasicDBObject> x = new ArrayList<BasicDBObject>();
                                BasicDBObject inner0_0 = new BasicDBObject(MongoDbManager.not_,
                                        Pattern.compile("US"));
                                BasicDBObject inner1_0 = new BasicDBObject("country_code", inner0_0);
                                x.add(inner1_0);

                                BasicDBObject inner0_1 = new BasicDBObject(MongoDbManager.gte_, 400000);
                                BasicDBObject inner1_1 = new BasicDBObject("population", inner0_1);
                                x.add(inner1_1);

                                BasicDBObject dbo = new BasicDBObject();
                                dbo.append("search_field", sActualName);
                                dbo.append(MongoDbManager.or_, x);

                                DBCursor dbc = _georefDB.find(dbo);
                                if (dbc.size() >= 1) { // Problems!

                                    //Create list of candidates

                                    Type listType = new TypeToken<LinkedList<GeoFeaturePojo>>() {
                                    }.getType();
                                    LinkedList<GeoFeaturePojo> grpl = new Gson()
                                            .fromJson(dbc.toArray().toString(), listType);

                                    //Debug
                                    if (_nDebugLevel >= 2) {
                                        System.out.println("\tERROR CANDIDATE: " + ent.getDisambiguatedName()
                                                + " VS " + ent.getActual_name() + " (" + dbc.count() + ")");

                                        if (_nDebugLevel >= 3) {
                                            for (GeoFeaturePojo grp : grpl) {
                                                System.out.println("\t\tCandidate:" + grp.getCity() + " / "
                                                        + grp.getRegion() + " / " + grp.getCountry());
                                            }
                                        }
                                    }

                                    Candidate candidate = new Candidate(ent, grpl, stateOrCountry);
                                    dubiousLocations.put(ent.getIndex(), candidate);
                                    bStrongCandidate = true;

                                } // if strong candidate
                            } //TESTED ("reston, virginia" after "virginia/stateorcounty" mention)
                              // (end if can't shortcut past all this)

                        } // end if a US town
                    } // end if in the format "A, B"

                } // if weak candidate
                  //TESTED

                if (!bStrongCandidate) { // Obv can't count on a disambiguous candidate:               
                    String type = ent.getType().toLowerCase();

                    if (type.equals("stateorcounty")) {
                        String disName = ent.getDisambiguatedName().toLowerCase();
                        if (_abbrStateRegex.matcher(disName).matches()) {
                            otherRegions.add(getStateFromAbbr(disName));
                        } else {
                            otherRegions.add(ent.getDisambiguatedName().toLowerCase());
                        }
                        otherCountriesOrRegionsReferenced.add("united states");
                    } //TESTED: "mich./stateorcounty"
                    else if (type.equals("country")) {
                        String disName = ent.getDisambiguatedName().toLowerCase();

                        // Translation of known badly transcribed countries:
                        // (England->UK)
                        if (disName.equals("england")) {
                            otherCountries.add("united kingdom");
                        } //TESTED
                        else {
                            otherCountries.add(ent.getDisambiguatedName().toLowerCase());
                        }
                    } else if (type.equals("region")) {
                        otherRegions.add(ent.getDisambiguatedName().toLowerCase());
                    } else if (type.equals("city")) {
                        String splitMe[] = ent.getDisambiguatedName().split(",\\s*");
                        if (2 == splitMe.length) {
                            otherCountriesOrRegionsReferenced.add(splitMe[1].toLowerCase());
                            if (this._statesRegex.matcher(splitMe[1]).find()) {
                                otherCountriesOrRegionsReferenced.add("united states");
                            } //TESTED: "lexingon, kentucky/city"
                        }
                    }
                } //TESTED: just above clauses

            } // if location

        } // (end loop over entities)

    // Debug:
    if ((_nDebugLevel >= 3) && (!dubiousLocations.isEmpty())) {
        for (String s : otherRegions) {
            System.out.println("Strong region: " + s);
        }
        for (String s : otherCountries) {
            System.out.println("Strong countries: " + s);
        }
        for (String s : otherCountriesOrRegionsReferenced) {
            System.out.println("Weak regionscountries: " + s);
        }
    }

    // 2] The requirements and algorithm are discussed in 
    // http://ikanow.jira.com/wiki/display/INF/Beta...+improving+AlchemyAPI+extraction+%28geo%29
    // Canonical cases:
    // Darfur -> Darfur, MN even though Sudan and sometimes Darfur, Sudan are present
    // Shanghai -> Shanghai, WV even though China is mentioned (and not WV)
    // Manchester -> Manchester village, NY (not Manchester, UK)
    // Philadelphia -> Philadelphia (village), NY (though NY is mentioned and not PA) 

    // We're generating the following order
    //       10] Sitting tenant with strong direct
    //       15] Large city with strong direct      
    //       20] Region with direct
    //       30] Large city with strong indirect
    //       40] Sitting tenant with strong indirect 
    //       50] Region with indirect
    //       60] Another foreign possibility with strong direct 
    //       70] Large city with weak direct
    //       72] Large city with weak indirect
    //       75] Large city with no reference 
    //       78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) 
    //       80] Sitting tenant with any weak (US) direct or indirect 
    //       90] Another foreign possibility with strong indirect 
    //      100] Another foreign possibility with weak direct 
    //      110] Another foreign possibility with weak indirect 
    //      120] Region with no reference, if there is only 1
    //      130] Sitting tenant with none of the above (ie default)
    //      140] Anything else!

    for (Map.Entry<String, Candidate> pair : dubiousLocations.entrySet()) {
        EntityPojo ent = pair.getValue().entity;
        Candidate candidate = pair.getValue();

        // 2.1] Let's analyse the "sitting tenant"

        int nPrio = 130;
        GeoFeaturePojo currLeader = null;
        int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other)

        if (otherRegions.contains(candidate.state)) { // Strong direct ref, winner!
            nPrio = 10; // winner!
        } //TESTED: "san antonio, texas/city" vs "texas"
        else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) {
            // Indirect ref
            nPrio = 40; // good, but beatable...
        } //TESTED: "philadelphia (village), new york/city" 
        else if (otherCountries.contains("united states")) { // Weak direct ref
            nPrio = 80; // better than nothing...            
        } //TESTED: "apache, oklahoma/city"
        else if (otherCountriesOrRegionsReferenced.contains("united states")) { // Weak indirect ref
            nPrio = 80; // better than nothing...            
        } //TESTED: "washington, d.c." have DC as stateorcounty, but US in countries list

        // Special case: we don't like "village":
        if ((80 != nPrio) && ent.getDisambiguatedName().contains("village")
                && !ent.getActual_name().contains("village")) {
            nPrio = 80;
        } //TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia"

        // Debug
        if (_nDebugLevel >= 2) {
            System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio);
        }

        // Alternatives
        if (nPrio > 10) {

            LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates;
            for (GeoFeaturePojo geo : geos) {

                int nAltPrio = 140;
                int nAltCase = -1;
                String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null;
                String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null;
                String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null;

                // 2.2] CASE 1: I'm a city with pop >= 400K (best score 15)
                //                15] Large city with strong direct      
                //                30] Large city with strong indirect
                //                70] Large city with weak direct
                //                72] Large city with weak indirect
                //                75] Large city with no reference                

                if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) {
                    nAltCase = 1;

                    if ((null != region) && (otherRegions.contains(region))) {
                        nAltPrio = 15; // strong direct
                    } //TESTED: "dallas / Texas / United States = 15"
                    else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
                        nAltPrio = 30; // strong indirect
                    } //TESTED: "sacramento / California / United State"
                    else if ((null != country) && (otherCountries.contains(country))) {
                        nAltPrio = 70; // weak direct 
                    } //TESTED: "berlin, germany", with "germany" directly mentioned
                    else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                        nAltPrio = 72; // weak indirect 
                    } //TESTED: "los angeles / California / United States = 72"
                    else {
                        nAltPrio = 75; // just for being big!
                    } //TESTED: "barcelona, spain"
                }

                // 2.3] CASE 2: I'm a region (best score=20, can beat current score)
                //                20] Region with direct
                //                50] Region with indirect
                //               120] Region with no reference, if there is only 1

                else if ((null == city) && (nPrio > 20)) {
                    nAltCase = 2;

                    if ((null != country) && (otherCountries.contains(country))) {
                        nAltPrio = 20; // strong direct 
                    } //TESTED: (region) "Berlin, Germany" with "Germany" mentioned
                    else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                        nAltPrio = 50; // strong indirect 
                    } //(haven't seen, but we'll live)
                    else {
                        nAltPrio = 120; // (just for being there)
                    } //TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China"
                }

                // 2.4] CASE 3: I'm any foreign possibility (best score=60)
                //                60] Another foreign possibility with strong direct 
                //                78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) 
                //                90] Another foreign possibility with strong indirect 
                //               100] Another foreign possibility with weak direct 
                //               110] Another foreign possibility with weak indirect 

                else if (nPrio > 60) {
                    nAltCase = 3;

                    if ((null != region) && (otherRegions.contains(region))) {
                        nAltPrio = 60; // strong direct

                        // Double check we're not falling into the trap below:
                        // (constant-first equals: a record without a country code counts as non-US
                        // instead of throwing)
                        if (!"US".equals(geo.getCountry_code())) {
                            Matcher m = this._statesRegex.matcher(geo.getRegion());
                            if (m.matches()) { // non US state matching against (probably) US state, disregard)
                                nAltPrio = 140;
                            }
                        } //TESTED (same clause as below)

                    } //TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause)

                    if (nAltPrio > 60) { // (may need to re-run test)
                        if ((null != country) && (otherCountries.contains(country))) {
                            if (geo.getPopulation() < 100000) {
                                nAltPrio = 90; // strong indirect
                            } //TESTED: "washington / Villa Clara / Cuba"
                            else {
                                nAltPrio = 78; // strong indirect, with boost!                        
                            } //TESTED: "geneva, Geneve, Switzerland", pop 180K
                        } else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
                            nAltPrio = 100; // weak direct
                        } //TESTED: "lincoln / Lincolnshire / United Kingdom = 100"
                        else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                            nAltPrio = 110; // weak indirect
                        } //(haven't seen, but we'll live)                  
                    }
                }
                // Debug:
                if ((_nDebugLevel >= 2) && (nAltPrio < 140)) {
                    System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / "
                            + geo.getCountry() + " score=" + nAltPrio);
                }

                // Outcome of results:

                if (nAltPrio < nPrio) {
                    currLeader = geo;
                    nPrio = nAltPrio;
                    nCase = nAltCase;
                }
            } // end loop over alternatives

            if (null != currLeader) { // Need to change

                if (1 == nCase) {
                    this._nMovedToLargeCity++;

                    //(Cities are lower case in georef DB for some reason)
                    String city = WordUtils.capitalize(currLeader.getCity());

                    // (constant-first equals: null country code is handled as non-US)
                    if ("US".equals(currLeader.getCountry_code())) { // Special case: is this just the original?

                        String region = currLeader.getRegion();
                        if (region.equals("District of Columbia")) { // Special special case
                            region = "D.C.";
                        }
                        String sCandidate = city + ", " + region;

                        if (!sCandidate.equals(ent.getDisambiguatedName())) {
                            ent.setDisambiguatedName(sCandidate);
                            ent.setIndex(ent.getDisambiguatedName() + "/city");
                            ent.setSemanticLinks(null);
                            bChangedAnything = true;
                        } //TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. -> Wash DC)
                        else {
                            this._nMovedToLargeCity--;
                            _nStayedWithOriginal++;
                        } //TESTED ("Washington DC", "San Juan, Puerto Rico")
                    } //TESTED (see above)
                    else {
                        ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
                        ent.setIndex(ent.getDisambiguatedName() + "/city");
                        ent.setSemanticLinks(null);
                        bChangedAnything = true;
                    } //TESTED: "london, california/city to London, United Kingdom"
                } else if (2 == nCase) {
                    this._nMovedToRegion++;
                    ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry());
                    ent.setIndex(ent.getDisambiguatedName() + "/region");
                    ent.setSemanticLinks(null);
                    bChangedAnything = true;

                } //TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above)
                else {
                    //(Cities are lower case in georef DB for some reason)
                    String city = WordUtils.capitalize(currLeader.getCity());

                    this._nMovedToForeignCity++;
                    ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
                    ent.setIndex(ent.getDisambiguatedName() + "/city");
                    ent.setSemanticLinks(null);
                    bChangedAnything = true;

                } //TESTED: "Moved geneva, new york/city to Geneva, Switzerland"

                // (semantic links are cleared whenever the entity is moved, so null marks a move here)
                if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) {
                    System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName());
                }
            } else {
                _nStayedWithOriginal++;
            }

        } // (if sitting tenant not holder)

    } // (end loop over candidates)      

    if ((_nDebugLevel >= 1) && bChangedAnything) {
        System.out.println("\t(((Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getUrl() + ")))");
    }

    return bChangedAnything;
}

From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityPersonCleanser.java

License:Open Source License

/**
 * Loads documents from {@code docsDB}, runs {@code cleansePeopleInDocu} on each,
 * and optionally writes the modified entity lists back.
 * Updates the {@code _nDocusProcessed} and {@code _nDocusModified} counters.
 *
 * @param nSkip     number of matching documents to skip
 * @param userQuery query selecting the documents to process; {@code null} means
 *                  an empty query
 * @param nLimit    maximum number of documents to fetch; values &lt;= 0 apply no limit
 * @param bAlterDB  if true, the entity list of each modified document is
 *                  overwritten in the database
 */
public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB)
        throws NumberFormatException, UnknownHostException, MongoException {

    // Initialization (regexes and stuff)
    this.initialize();

    // Launch MongoDB query

    BasicDBObject query = userQuery;
    if (null == query) {
        // Fix: the original created the empty query but never assigned it,
        // so a null query was passed on to find().
        query = new BasicDBObject();
    }

    // Just get the entity list out to save a few CPU cycles
    BasicDBObject outFields = new BasicDBObject();
    outFields.append(DocumentPojo.entities_, 1);
    outFields.append(DocumentPojo.url_, 1); // (help with debugging)
    outFields.append(DocumentPojo.title_, 1); // (help with debugging)

    DBCursor dbc = null;
    if (nLimit > 0) {
        dbc = docsDB.find(query, outFields).limit(nLimit).skip(nSkip);
    } else { // Everything!
        dbc = docsDB.find(query, outFields).skip(nSkip);
    }

    // Create POJO array of documents (definitely not the most efficient, but
    // will make integration with the harvester easier)

    List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());

    // Loop over array and invoke the cleansing function for each one

    for (DocumentPojo docu : docus) {
        if (this.cleansePeopleInDocu(docu)) {
            this._nDocusModified++;

            if (bAlterDB) {

                BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_,
                        (DBObject) com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities())));
                BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0);

                // Overwrite the existing entities list with the new one
                docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true);
                // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it)

            } //TESTED: checked on "Feed: Japan's Three Elections / 4c92863751cc2e59d612000b / 30"
        }
        this._nDocusProcessed++;
    }
}

From source file:com.ikanow.infinit.e.processing.custom.scheduler.CustomScheduleManager.java

License:Open Source License

/**
 * Finds one job that has not started yet and whose next run time is already in
 * the past, and marks it as picked up in the same findAndModify call (job id set
 * to the empty string, "lastRunTime" set to now).
 *
 * @param prop_custom    properties passed to {@code availableSlots}
 * @param bLocalMode     together with {@code bHadoopEnabled}: when both are false,
 *                       only jobs whose "jarURL" is null are considered
 * @param bHadoopEnabled see {@code bLocalMode}
 * @return the job that was picked up, or {@code null} if no slot is available,
 *         no job matches, or an exception occurred (its stack trace is printed)
 */
public static CustomMapReduceJobPojo getJobsToRun(PropertiesManager prop_custom, boolean bLocalMode,
        boolean bHadoopEnabled) {
    try {
        // Don't exceed the maximum number of running jobs
        // (memory problems have been seen when this isn't limited)
        if (!availableSlots(prop_custom)) {
            return null;
        }

        // Selection criteria: no job id yet, not waiting on anything, run time reached
        final BasicDBObject criteria = new BasicDBObject();
        criteria.append(CustomMapReduceJobPojo.jobidS_, null);
        criteria.append(CustomMapReduceJobPojo.waitingOn_, new BasicDBObject(MongoDbManager.size_, 0));
        final BasicDBObject runTimeReached = new BasicDBObject(MongoDbManager.lt_, new Date().getTime());
        criteria.append(CustomMapReduceJobPojo.nextRunTime_, runTimeReached);
        if (!(bHadoopEnabled || bLocalMode)) {
            // Can only get shared queries:
            criteria.append("jarURL", null);
        }

        // Modification applied to the selected job
        final BasicDBObject fieldsToSet = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, "")
                .append("lastRunTime", new Date());
        final BasicDBObject setClause = new BasicDBObject(MongoDbManager.set_, fieldsToSet);

        final DBObject claimed = DbManager.getCustom().getLookup().findAndModify(criteria, null, null, false,
                setClause, true, false);
        if (claimed == null) {
            return null;
        }
        return CustomMapReduceJobPojo.fromDb(claimed, CustomMapReduceJobPojo.class);
    } catch (Exception ex) {
        //oh noes!
        ex.printStackTrace();
    }

    return null;
}

From source file:com.ikanow.infinit.e.processing.custom.scheduler.CustomScheduleManager.java

License:Open Source License

/**
 * Picks one running job so the caller can decide whether it has completed.
 * <p>
 * A job is treated as running when its job id is none of: null, the empty string,
 * or "CHECKING_COMPLETION". The selected job is atomically re-tagged with the job id
 * "CHECKING_COMPLETION" and its last-checked date is set to now.
 *
 * @param bHadoopEnabled    when false, only jobs whose jar URL is null are considered
 * @param incompleteJobsMap receives an entry (job _id to job id string) for the returned job
 * @return the selected job, or null when nothing matched or an exception occurred
 *         (its stack trace is printed)
 */
public static CustomMapReduceJobPojo getJobsToMakeComplete(boolean bHadoopEnabled,
        Map<ObjectId, String> incompleteJobsMap) {
    try {
        // Exclude every "not actually running" job id value
        BasicDBObject selector = new BasicDBObject();
        selector.put(MongoDbManager.nor_,
                Arrays.asList(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, null),
                        new BasicDBObject(CustomMapReduceJobPojo.jobidS_, "CHECKING_COMPLETION"),
                        new BasicDBObject(CustomMapReduceJobPojo.jobidS_, "")));

        BasicDBObject checkingFields = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, "CHECKING_COMPLETION");
        checkingFields.put(CustomMapReduceJobPojo.lastChecked_, new Date());
        BasicDBObject markChecking = new BasicDBObject(MongoDbManager.set_, checkingFields);

        if (!bHadoopEnabled) {
            // Can only get shared queries:
            selector.append(CustomMapReduceJobPojo.jarURL_, null);
        }

        // NOTE(review): the two-argument findAndModify is assumed to return the document as it
        // was BEFORE the update, so jobidS below would still be the real job id - confirm
        DBObject found = DbManager.getCustom().getLookup().findAndModify(selector, markChecking);
        if (null == found) {
            return null;
        }

        CustomMapReduceJobPojo runningJob = CustomMapReduceJobPojo.fromDb(found, CustomMapReduceJobPojo.class);
        incompleteJobsMap.put(runningJob._id, runningJob.jobidS);
        return runningJob;
    } catch (Exception ex) {
        // Best effort: report the problem and fall through to "nothing to check"
        ex.printStackTrace();
    }

    return null;
}

From source file:com.ikanow.infinit.e.processing.custom.status.CustomStatusManager.java

License:Open Source License

/**
 * Sets the custom mr pojo to be complete for the
 * current job.  Currently this is done by removing the
 * jobid and updating the next runtime, increments the
 * amount of timeRan counter as well so we can calculate nextRunTime
 * /*from w  w  w.jav  a  2  s  . c  o  m*/
 * Also set lastCompletion time to now (best we can approx)
 * 
 * @param cmr
 */
public void setJobComplete(CustomMapReduceJobPojo cmr, boolean isComplete, boolean isError, float mapProgress,
        float reduceProgress, String errorMessage) {
    // First off, if complete then run custom internal engine finish routines:
    if ((null != cmr.mapper) && !cmr.mapper.isEmpty() && !cmr.mapper.equalsIgnoreCase("none")) {
        StringBuffer postTaskActivityErrors = new StringBuffer();
        int errLen = 0;
        if (null != errorMessage) {
            postTaskActivityErrors = new StringBuffer(errorMessage);
            errLen = postTaskActivityErrors.length();
        }
        InfiniteHadoopUtils.handlePostTaskActivities(cmr, isError, postTaskActivityErrors);
        if (postTaskActivityErrors.length() > errLen) {
            errorMessage = postTaskActivityErrors.toString();
        }
    } //TESTED

    // (Note, inc_ and unset_ are added in one place each, so can't use them without ensuring you combine existing uses)  
    BasicDBObject updates = new BasicDBObject();
    BasicDBObject update = new BasicDBObject();
    try {
        long nNew = 0;
        long nTotal = 0;
        if (isComplete) {
            long runtime = new Date().getTime() - cmr.lastRunTime.getTime();
            long timeFromSchedule = cmr.lastRunTime.getTime() - cmr.nextRunTime;

            updates.append(CustomMapReduceJobPojo.jobidS_, null);
            updates.append(CustomMapReduceJobPojo.jobidN_, 0);
            try {
                //if next run time reschedules to run before now, keep rescheduling until its later
                //the server could have been turned off for days and would try to rerun all jobs once a day
                long nextRunTime = CustomScheduleManager.getNextRunTime(cmr.scheduleFreq, cmr.firstSchedule,
                        cmr.nextRunTime);
                updates.append(CustomMapReduceJobPojo.nextRunTime_, nextRunTime);
            } catch (Exception e) {
            } // just carry on, we'll live...

            updates.append(CustomMapReduceJobPojo.lastCompletionTime_, new Date());
            updates.append(CustomMapReduceJobPojo.tempConfigXMLLocation_, null);
            updates.append(CustomMapReduceJobPojo.tempJarLocation_, null);
            try {
                InfiniteHadoopUtils.removeTempFile(cmr.tempConfigXMLLocation);
                InfiniteHadoopUtils.removeTempFile(cmr.tempJarLocation);
            } catch (Exception e) {
                _logger.info("job_error_removing_tempfiles=" + InfiniteHadoopUtils.createExceptionMessage(e));
            }

            BasicDBObject incs = new BasicDBObject(CustomMapReduceJobPojo.timesRan_, 1);
            //copy depencies to waitingOn
            updates.append(CustomMapReduceJobPojo.waitingOn_, cmr.jobDependencies);
            if (!isError) {
                // Counts and move and output
                nNew = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp).count();

                //TODO (INF-1159): this shouldn't really be here but it makes life much easier for now (really should be part of the m/r OutputFormat...) 
                CustomOutputManager.completeOutput(cmr, prop_custom);

                //if job was successfully, mark off dependencies
                removeJobFromChildren(cmr._id);

                // More counts:
                nTotal = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection).count();

                // Status:
                String completionStatus = "Schedule Delta: " + timeFromSchedule + "ms\nCompletion Time: "
                        + runtime + "ms\nNew Records: " + nNew + "\nTotal Records: " + nTotal;
                if (null == errorMessage) { // (I think will always be the case?)
                    errorMessage = completionStatus;
                } else {
                    errorMessage += "\n" + completionStatus;
                }
                if ((null != cmr.tempErrors) && !cmr.tempErrors.isEmpty()) { // Individual errors reported from map/combine/reduce
                    StringBuffer sb = new StringBuffer(errorMessage).append("\n\nLog Messages:\n\n");
                    for (String err : cmr.tempErrors) {
                        sb.append(err).append("\n");
                    }
                    errorMessage = sb.toString();
                    update.put(MongoDbManager.unset_, new BasicDBObject(CustomMapReduceJobPojo.tempErrors_, 1));
                }
                updates.append(CustomMapReduceJobPojo.errorMessage_, errorMessage); // (will often be null)               
            } else {
                if ((null != cmr.tempErrors) && !cmr.tempErrors.isEmpty()) { // Individual errors reported from map/combine/reduce
                    StringBuffer sb = new StringBuffer(errorMessage).append("\n\nLog Messages:\n\n");
                    for (String err : cmr.tempErrors) {
                        sb.append(err).append("\n");
                    }
                    errorMessage = sb.toString();
                    update.put(MongoDbManager.unset_, new BasicDBObject(CustomMapReduceJobPojo.tempErrors_, 1));
                }
                //failed, just append error message                              
                updates.append(CustomMapReduceJobPojo.errorMessage_, errorMessage);
                incs.append(CustomMapReduceJobPojo.timesFailed_, 1);
                cmr.timesFailed++; // (so that in memory processes can tell if a job failed)
            }
            update.append(MongoDbManager.inc_, incs);

            if (null != cmr.jobidS) {
                _logger.info("job_completion_title=" + cmr.jobtitle + " job_completion_id=" + cmr._id.toString()
                        + " job_completion_time=" + runtime + " job_schedule_delta=" + timeFromSchedule
                        + " job_completion_success=" + !isError + " job_hadoop_id=" + cmr.jobidS + "_"
                        + cmr.jobidN + " job_new_records=" + nNew + " job_total_records=" + nTotal);
            } else {
                _logger.info("job_completion_title=" + cmr.jobtitle + " job_completion_id=" + cmr._id.toString()
                        + " job_completion_time=" + runtime + " job_schedule_delta=" + timeFromSchedule
                        + " job_completion_success=" + !isError + " job_new_records=" + nNew
                        + " job_total_records=" + nTotal);
            }
        }
        updates.append(CustomMapReduceJobPojo.mapProgress_, mapProgress);
        updates.append(CustomMapReduceJobPojo.reduceProgress_, reduceProgress);
    } catch (Exception ex) {
        //ex.printStackTrace();
        _logger.info("job_error_updating_status_title=" + cmr.jobtitle + " job_error_updating_status_id="
                + cmr._id.toString() + " job_error_updating_status_message="
                + InfiniteHadoopUtils.createExceptionMessage(ex));
    } finally { // It's really bad if this doesn't happen, so do it here so that it always gets called
        if (!updates.isEmpty()) {
            update.append(MongoDbManager.set_, updates);
            // (if isComplete, should always include resetting jobidS and jobidN)
            DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id),
                    update);

            // (also set local version)
            cmr.errorMessage = errorMessage;
        }
        if (isComplete || isError) {
            // If we're derived from a source then update the source:
            if (null != cmr.derivedFromSourceKey) {

                // For a source's first run, need to grab the entire source to check if we need to override the tmin/tmax
                SourcePojo srcJustRun = null;

                if ((isComplete && !isError) && (0 == cmr.timesRan)) {
                    BasicDBObject srcQuery = new BasicDBObject(SourcePojo.key_, cmr.derivedFromSourceKey);
                    srcJustRun = SourcePojo.fromDb(DbManager.getIngest().getSource().findOne(srcQuery),
                            SourcePojo.class);
                    if (null == srcJustRun.getHarvestStatus()) { // (don't allow initial override, if one is set)
                        srcJustRun.setHarvestStatus(new SourceHarvestStatusPojo());
                    }
                    srcJustRun.getHarvestStatus().setHarvest_status(HarvestEnum.success);

                    if (null != srcJustRun) {
                        try {
                            LinkedList<CustomMapReduceJobPojo> updatedJobs = new LinkedList<CustomMapReduceJobPojo>();
                            SourcePipelineToCustomConversion.convertSourcePipeline(srcJustRun, updatedJobs,
                                    false);
                            for (CustomMapReduceJobPojo cmrUpdate : updatedJobs) {
                                if (cmrUpdate._id.equals(cmr._id)) {
                                    DbManager.getCustom().getLookup().save(cmrUpdate.toDb());
                                }
                            }
                        } catch (Exception e) {
                        } // just carry on
                    }
                } //TESTED (by hand)

                BasicDBObject query = new BasicDBObject(SourcePojo.key_, cmr.derivedFromSourceKey);
                BasicDBObject setUpdate = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                        isError ? HarvestEnum.error.toString() : HarvestEnum.success.toString());
                if (null != cmr.errorMessage) {
                    setUpdate.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, cmr.errorMessage);
                }
                BasicDBObject srcUpdate = new BasicDBObject(DbManager.set_, setUpdate);
                DbManager.getIngest().getSource().update(query, srcUpdate, false, false);
            }
        } //TESTED (by hand)
    }
}

From source file:com.ikanow.infinit.e.processing.custom.status.CustomStatusManager.java

License:Open Source License

/**
 * Updates the status of the current, active, job.
 * <p>
 * Stores the given job id fields and temp file locations against the job record and
 * clears its error message. If a job object is supplied and it is derived from a
 * source, that source's harvest status is set to success_iteration.
 *
 * @param _id         id of the job record to update
 * @param jobids      job id string to store
 * @param jobidn      job id number to store
 * @param xmlLocation temp config XML location to store
 * @param jarLocation temp jar location to store
 * @param job         the job, used only to find the source it is derived from; may be null
 */
public void updateJobPojo(ObjectId _id, String jobids, int jobidn, String xmlLocation, String jarLocation,
        CustomMapReduceJobPojo job) {
    try {
        BasicDBObject activeJobFields = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, jobids)
                .append(CustomMapReduceJobPojo.jobidN_, jobidn)
                .append(CustomMapReduceJobPojo.tempConfigXMLLocation_, xmlLocation)
                .append(CustomMapReduceJobPojo.tempJarLocation_, jarLocation)
                .append(CustomMapReduceJobPojo.errorMessage_, null);
        BasicDBObject jobSelector = new BasicDBObject(CustomMapReduceJobPojo._id_, _id);
        DbManager.getCustom().getLookup().update(jobSelector,
                new BasicDBObject(MongoDbManager.set_, activeJobFields));

        // Nothing more to do unless the job is derived from a source
        if ((null == job) || (null == job.derivedFromSourceKey)) {
            return;
        }

        //update to success_iteration
        BasicDBObject sourceSelector = new BasicDBObject(SourcePojo.key_, job.derivedFromSourceKey);
        BasicDBObject statusField = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                HarvestEnum.success_iteration.toString());
        DbManager.getIngest().getSource().update(sourceSelector,
                new BasicDBObject(DbManager.set_, statusField), false, false);
    } catch (Exception ex) {
        // Best effort: report the problem and carry on
        ex.printStackTrace();
    }
}