Example usage for com.mongodb BasicDBObject get

List of usage examples for com.mongodb BasicDBObject get

Introduction

On this page you can find example usage for com.mongodb BasicDBObject get.

Prototype

public Object get(final String key) 

Document

Gets a value from this object by key.
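
Before the full project examples below, here is a minimal, self-contained sketch of the call; the key names and values are illustrative assumptions, not taken from any of the examples.

import com.mongodb.BasicDBObject;

public class BasicDBObjectGetExample {
    public static void main(String[] args) {
        // Build a document and read values back by key (illustrative keys/values)
        BasicDBObject doc = new BasicDBObject("name", "alice").append("age", 30);

        Object name = doc.get("name");        // "alice"
        Object missing = doc.get("nickname"); // null - key not present

        System.out.println("name = " + name + ", missing = " + missing);
    }
}

Because the return type is Object, callers typically cast the result (for example to BasicDBObject or String) or use the typed helpers such as getString and getInt, as the examples below do.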

Usage

From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java

License:Open Source License

private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable,
        String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer,
        String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue,
        String arguments, Boolean incrementalMode, ObjectId userId, Boolean selfMerge,
        String originalOutputCollection, Boolean appendResults) throws IOException {
    String dbserver = prop_general.getDatabaseServer();
    output = outputDatabase + "." + tempOutputCollection;

    boolean isAdmin = AuthUtils.isAdmin(userId);

    int nSplits = 8;
    int nDocsPerSplit = 12500;

    //add communities to query if this is not a custom table
    BasicDBObject oldQueryObj = null;
    BasicDBObject srcTags = null;
    // Start with the old query:
    if (query.startsWith("{")) {
        oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query);
    } else {
        oldQueryObj = new BasicDBObject();
    }
    boolean elasticsearchQuery = oldQueryObj.containsField("qt") && !isCustomTable;
    int nLimit = 0;
    if (oldQueryObj.containsField("$limit")) {
        nLimit = oldQueryObj.getInt("$limit");
        oldQueryObj.remove("$limit");
    }
    if (oldQueryObj.containsField("$splits")) {
        nSplits = oldQueryObj.getInt("$splits");
        oldQueryObj.remove("$splits");
    }
    if (oldQueryObj.containsField("$srctags")) {
        srcTags = new BasicDBObject(SourcePojo.tags_, oldQueryObj.get("$srctags"));
        oldQueryObj.remove("$srctags");
    }
    if (bLocalMode) { // If in local mode, then set this to a large number so we always run inside our limit/split version
        // (since for some reason MongoInputFormat seems to fail on large collections)
        nSplits = InfiniteMongoSplitter.MAX_SPLITS;
    }
    if (oldQueryObj.containsField("$docsPerSplit")) {
        nDocsPerSplit = oldQueryObj.getInt("$docsPerSplit");
        oldQueryObj.remove("$docsPerSplit");
    }
    oldQueryObj.remove("$fields");
    oldQueryObj.remove("$output");
    oldQueryObj.remove("$reducers");
    String mapperKeyClass = oldQueryObj.getString("$mapper_key_class", "");
    String mapperValueClass = oldQueryObj.getString("$mapper_value_class", "");
    oldQueryObj.remove("$mapper_key_class");
    oldQueryObj.remove("$mapper_value_class");
    String cacheList = null;
    Object cacheObj = oldQueryObj.get("$caches");
    if (null != cacheObj) {
        cacheList = cacheObj.toString(); // (either array of strings, or single string)
        if (!cacheList.startsWith("[")) {
            cacheList = "[" + cacheList + "]"; // ("must" now be valid array)
        }
        oldQueryObj.remove("$caches");
    } //TESTED

    if (null != nDebugLimit) { // (debug mode override)
        nLimit = nDebugLimit;
    }
    boolean tmpIncMode = (null != incrementalMode) && incrementalMode;

    Date fromOverride = null;
    Date toOverride = null;
    Object fromOverrideObj = oldQueryObj.remove("$tmin");
    Object toOverrideObj = oldQueryObj.remove("$tmax");
    if (null != fromOverrideObj) {
        fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true);
    }
    if (null != toOverrideObj) {
        toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false);
    }

    if (!isCustomTable) {
        if (elasticsearchQuery) {
            oldQueryObj.put("communityIds", communityIds);
            //tmin/tmax not supported - already have that capability as part of the query
        } else {
            if (input.equals("feature.temporal")) {
                if ((null != fromOverride) || (null != toOverride)) {
                    oldQueryObj.put("value.maxTime",
                            InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, true));
                } //TESTED
                oldQueryObj.put("_id.c", new BasicDBObject(DbManager.in_, communityIds));
            } else {
                oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
                if ((null != fromOverride) || (null != toOverride)) {
                    oldQueryObj.put("_id",
                            InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false));
                } //TESTED         
                if (input.equals("doc_metadata.metadata")) {
                    oldQueryObj.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (ensures not soft-deleted)
                }
            }
        }
    } else {
        if ((null != fromOverride) || (null != toOverride)) {
            oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false));
        } //TESTED
          //get the custom table (and database)
        input = CustomOutputManager.getCustomDbAndCollection(input);
    }
    query = oldQueryObj.toString();

    if (arguments == null)
        arguments = "";

    // Generic configuration
    out.write("<?xml version=\"1.0\"?>\n<configuration>");

    // Mongo specific configuration
    out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title
            + "</value></property>"
            + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>"
            + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>"
            + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://"
            + dbserver + "/" + input + "</value></property>"
            + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://"
            + dbserver + "/" + output + "</value>  </property>"
            + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>"
            + StringEscapeUtils.escapeXml(query) + "</value></property>"
            + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>"
            + ((fields == null) ? ("") : fields) + "</value></property>"
            + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>"
            + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>"
            + nLimit + "</value><!-- 0 == no limit --></property>"
            + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? --><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>"
            + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper
            + "</value></property>"
            + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer
            + "</value></property>"
            + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>"
            + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat</value></property>"
            + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>"
            + outputKey + "</value></property>"
            + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>"
            + outputValue + "</value></property>"
            + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value>"
            + mapperKeyClass + "</value></property>"
            + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value>"
            + mapperValueClass + "</value></property>"
            + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>"
            + combiner + "</value></property>"
            + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>"
            + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>"
            + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>");

    // Infinit.e specific configuration

    out.write("\n\t<property><!-- User Arguments [optional] --><name>infinit.e.userid</name><value>"
            + StringEscapeUtils.escapeXml(userId.toString()) + "</value></property>"
            + "\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>"
            + StringEscapeUtils.escapeXml(arguments) + "</value></property>"
            + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>"
            + nSplits + "</value></property>"
            + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>"
            + nDocsPerSplit + "</value></property>"
            + "\n\t<property><!-- Infinit.e incremental mode [optional] --><name>update.incremental</name><value>"
            + tmpIncMode + "</value></property>"
            + "\n\t<property><!-- Infinit.e quick admin check [optional] --><name>infinit.e.is.admin</name><value>"
            + isAdmin + "</value></property>"
            + "\n\t<property><!-- Infinit.e userid [optional] --><name>infinit.e.userid</name><value>" + userId
            + "</value></property>");
    if (null != cacheList) {
        out.write(
                "\n\t<property><!-- Infinit.e cache list [optional] --><name>infinit.e.cache.list</name><value>"
                        + cacheList + "</value></property>");
    } //TESTED
    if (null != srcTags) {
        out.write(
                "\n\t<property><!-- Infinit.e src tags filter [optional] --><name>infinit.e.source.tags.filter</name><value>"
                        + srcTags.toString() + "</value></property>");
    }

    if (null != selfMerge && selfMerge && originalOutputCollection != null) {
        originalOutputCollection = "mongodb://" + dbserver + "/" + outputDatabase + "."
                + originalOutputCollection;
        out.write(
                "\n\t<property><!-- This jobs output collection for passing into the mapper along with input collection [optional] --><name>infinit.e.selfMerge</name><value>"
                        + originalOutputCollection + "</value></property>");
    }

    // Closing thoughts:
    out.write("\n</configuration>");

    out.flush();
    out.close();
}

From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java

License:Open Source License

public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields)
        throws SAXException, IOException, ParserConfigurationException {

    BasicDBList dbl = new BasicDBList();

    PropertiesManager props = new PropertiesManager();
    Configuration conf = getConfiguration(props);

    Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);

    @SuppressWarnings({ "unchecked", "rawtypes" })
    SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable(
            pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);

    // Very basic, only allow top level, 1 level of nesting, and field removal
    HashSet<String> fieldLookup = null;
    if (null != fields) {
        fieldLookup = new HashSet<String>();
        String[] fieldArray = fields.split(",");
        for (String field : fieldArray) {
            String[] fieldDecomp = field.split(":");
            fieldLookup.add(fieldDecomp[0]);
        }
    } //TOTEST

    int nRecords = 0;
    for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) {
        BasicDBObject element = new BasicDBObject();

        // KEY

        Writable key = record.getFirst();
        if (key instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key;
            element.put("key", writable.toString());
        } else if (key instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key;
            element.put("key", Double.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key;
            element.put("key", Integer.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key;
            element.put("key", Long.toString(writable.get()));
        } else if (key instanceof BSONWritable) {
            element.put("key", MongoDbUtil.convert((BSONWritable) key));
        }

        // VALUE

        Writable value = record.getSecond();
        if (value instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value;
            element.put("value", writable.toString());
        } else if (value instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value;
            element.put("value", Double.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value;
            element.put("value", Integer.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value;
            element.put("value", Long.toString(writable.get()));
        } else if (value instanceof BSONWritable) {
            element.put("value", MongoDbUtil.convert((BSONWritable) value));
        } else if (value instanceof org.apache.mahout.math.VectorWritable) {
            Vector vec = ((org.apache.mahout.math.VectorWritable) value).get();
            BasicDBList dbl2 = listFromMahoutVector(vec, "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) {
            org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value;
            element.put("valueWeight", vecW.getWeight());
            BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) {
            Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue();
            BasicDBObject clusterVal = new BasicDBObject();
            clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal));
            clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal));
            element.put("value", clusterVal);
        } else {
            element.put("unknownValue", value.getClass().toString());
        }

        // Check the fields settings:
        // Only handle a few...
        if (null != fieldLookup) {
            for (String fieldToRemove : fieldLookup) {
                if (fieldToRemove.startsWith("value.")) {
                    fieldToRemove = fieldToRemove.substring(6);
                    BasicDBObject nested = (BasicDBObject) element.get("value");
                    if (null != nested) {
                        nested.remove(fieldToRemove);
                    }
                } else {
                    element.remove(fieldToRemove);
                }
            } //TOTEST
        }

        dbl.add(element);
        nRecords++;
        if ((nLimit > 0) && (nRecords >= nLimit)) {
            break;
        }
    }

    return dbl;
}

From source file:com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchHadoopUtils.java

License:Apache License

public static void handleElasticsearchInput(CustomMapReduceJobPojo job, Configuration config,
        BasicDBObject advancedConfigurationDbo) {
    // Pull out type list:
    Object o = advancedConfigurationDbo.remove("$types");
    String[] types = null;
    if (null != o) {
        if (o instanceof BasicDBList) {
            types = ((BasicDBList) o).toArray(new String[0]);
        } else if (o instanceof String) {
            types = ((String) o).split("\\s*,\\s*");
        }
    } //TESTED (by hand)            

    //QUERY:

    // Date override:
    Date fromOverride = null;
    Date toOverride = null;
    Object fromOverrideObj = advancedConfigurationDbo.remove("$tmin");
    Object toOverrideObj = advancedConfigurationDbo.remove("$tmax");
    if (null != fromOverrideObj) {
        fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true);
    }
    if (null != toOverrideObj) {
        toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false);
    }
    Boolean streaming = null;
    Object streamingObj = advancedConfigurationDbo.remove("$streaming");
    if (streamingObj instanceof Boolean) {
        streaming = (Boolean) streamingObj;
    }

    //DEBUG
    //System.out.println("QUERY = " + advancedConfigurationDbo.toString());

    BasicDBObject newQuery = new BasicDBObject();
    Object queryObj = advancedConfigurationDbo.get("query");
    if (queryObj instanceof String) {
        config.set("es.query", queryObj.toString()); // URL version)         
        if ((null != fromOverride) || (null != toOverride)) {
            throw new RuntimeException(
                    "Can't specify $tmin/$tmax shortcut in conjunction with 'URL' query type");
        } //TESTED
    } else if (null != queryObj) {
        newQuery.put("query", queryObj);
        Object filterObj = advancedConfigurationDbo.get("filter");
        if (null != filterObj)
            newQuery.put("filter", filterObj); // (doesn't matter if it doesn't exist)
        Object fieldsObj = advancedConfigurationDbo.get("fields");
        if (null != fieldsObj)
            newQuery.put("fields", fieldsObj); // (doesn't matter if it doesn't exist)
        Object sizeObj = advancedConfigurationDbo.get("size");
        if (null != sizeObj)
            newQuery.put("size", sizeObj); // (doesn't matter if it doesn't exist)

        if ((null != fromOverride) || (null != toOverride)) {
            if (null == filterObj) {
                BasicDBObject filterRangeParamsDbo = new BasicDBObject();
                if (null != fromOverride) {
                    filterRangeParamsDbo.put("gte", fromOverride.getTime());
                }
                if (null != toOverride) {
                    filterRangeParamsDbo.put("lte", toOverride.getTime());
                }
                BasicDBObject filterRangeDbo = new BasicDBObject("@timestamp", filterRangeParamsDbo);
                BasicDBObject filterDbo = new BasicDBObject("range", filterRangeDbo);
                newQuery.put("filter", filterDbo);
            } else { // combine filter
                throw new RuntimeException(
                        "Can't (currently) specify $tmin/$tmax shortcut in conjunction with filter");
            } //TESTED            
        }

        config.set("es.query", newQuery.toString());
    }
    //(else no query == match all)

    //COMMUNITIES

    Pattern dateRegex = null;
    ThreadSafeSimpleDateFormat tssdf = null;
    if ((null != fromOverride) || (null != toOverride)) {
        dateRegex = Pattern.compile("[0-9]{4}[.][0-9]{2}[.][0-9]{2}");
        tssdf = new ThreadSafeSimpleDateFormat("yyyy.MM.dd");
    } //TESTED

    StringBuffer overallIndexNames = new StringBuffer();
    for (ObjectId commId : job.communityIds) {
        StringBuffer indexNames = new StringBuffer();
        //TODO (INF-2641): need to handle:
        //c) anyway to sub-query?! (look for communityIds term?!)

        if (null == streaming) {
            indexNames.append("recs_*").append(commId.toString()).append("*");
        } else if (streaming) {
            indexNames.append("recs_t_").append(commId.toString()).append("*");
        } else {// !streaming
            indexNames.append("recs_").append(commId.toString());
        } //TESTED

        StringBuffer decomposedIndexes = new StringBuffer();
        boolean needDecomposedIndexes = false;

        HashSet<String> typesAdded = new HashSet<String>();
        if ((null != types) && (null == fromOverride) && (null == toOverride)) { // (types manual, no date filtering - can be much simpler)
            for (String s : types)
                typesAdded.add(s);
        } else {
            // (All this oddly written code is to minimize the number of es types that get exposed, because
            //  they are really badly behaved in terms of backwards compatibility)

            if (null != types) {
                for (String s : types)
                    typesAdded.add(s);
            }

            ElasticSearchManager indexMgr = ElasticSearchManager.getIndex("doc_dummy"); // (index guaranteed to exist)
            Object[] indexMetaObj = indexMgr.getRawClient().admin().cluster().prepareState()
                    .setIndices(indexNames.toString()).setRoutingTable(false).setNodes(false)
                    .setListenerThreaded(false).get().getState().getMetaData().getIndices().values().toArray();

            if (null != indexMetaObj)
                for (Object oo : indexMetaObj) {
                    IndexMetaData indexMeta = (IndexMetaData) oo;
                    String indexName = indexMeta.getIndex();

                    if ((null != fromOverride) || (null != toOverride)) {
                        //DEBUG
                        //System.out.println("INDEX: " + indexName);                  

                        Matcher m = dateRegex.matcher(indexName);
                        if (m.find()) {
                            try {
                                Date d = tssdf.parse(m.group());
                                long endpoint = d.getTime() + 24L * 3600L * 1000L - 1;
                                //DEBUG
                                //System.out.println("***************** COMPARE: " + d + " FROM " + fromOverride + " TO " + toOverride + "..errr . " + m.group());

                                if (null != fromOverride) {
                                    if (endpoint < fromOverride.getTime()) { // no overlap on the left
                                        needDecomposedIndexes = true;
                                        continue;
                                    }
                                } //TESTED
                                if (null != toOverride) {
                                    if (d.getTime() > toOverride.getTime()) { // no overlap on the right
                                        needDecomposedIndexes = true;
                                        continue;
                                    }
                                } //TESTED

                            } catch (ParseException e) {
                                // just carry on, odd index name, it happens
                                needDecomposedIndexes = true;
                                continue;
                            }
                        }
                    } //TESTED (end loop over time checking)

                    if (null == types) {
                        Iterator<String> typesIt = indexMeta.getMappings().keysIt();
                        while (typesIt.hasNext()) {
                            String type = typesIt.next();
                            if (!type.equals("_default_")) {
                                typesAdded.add(type);
                            }
                        }
                    }
                    if (0 != decomposedIndexes.length()) {
                        decomposedIndexes.append(',');
                    }
                    decomposedIndexes.append(indexName);

                } //(end loop over indexes)
        } //(end if need to derive the types from the indexes)                

        if (needDecomposedIndexes) { // (because we filtered some indexes out)
            indexNames = decomposedIndexes;
        }
        if (0 == indexNames.length()) {
            continue; // nothing to do here...
        }

        int numTypesAdded = 0;
        if (typesAdded.isEmpty()) { // there doesn't seem to be any types associated with this set of indexes
            continue; // (ie don't add)
        } else
            for (String type : typesAdded) {
                if (numTypesAdded > 0) {
                    indexNames.append(",");
                } else {
                    indexNames.append("/");
                }
                numTypesAdded++;
                indexNames.append(type);
            }

        if (overallIndexNames.length() > 0) {
            overallIndexNames.append(",,");
        }
        overallIndexNames.append(indexNames);

    } //(end loop over community)
      //TESTED (by hand)

    if (0 == overallIndexNames.length()) {
        throw new RuntimeException(
                "Communities contained no types, either all indexes empty, or index is corrupt");
    } //TESTED (by hand)

    //DEBUG
    //System.out.println("INDEXES = " + overallIndexNames.toString());

    config.set("es.resource", overallIndexNames.toString());
    config.set("es.index.read.missing.as.empty", "yes");

    //proxy if running in debug mode:
    if (InfiniteEsInputFormat.LOCAL_DEBUG_MODE) {
        config.set("es.net.proxy.http.host", "localhost");
        config.set("es.net.proxy.http.port", "8888");
    } //TESTED (by hand)            

}

From source file:com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager.java

License:Open Source License

public static void updateDocEntitiesFromDeletedDocuments(String uuid) {
    String outCollection = new StringBuilder(uuid).append("_AggregationUtils").toString();
    try {
        PropertiesManager props = new PropertiesManager();
        if (props.getAggregationDisabled()) { // (no need to do this)
            return;
        }
        DBCollection outColl = DbManager.getDB("doc_metadata").getCollection(outCollection);

        DBCursor dbc = outColl.find();
        for (DBObject dbo : dbc) {
            BasicDBObject entityEl = (BasicDBObject) dbo;
            BasicDBObject entityVal = (BasicDBObject) entityEl.get("value");

            long nDocDecrement = entityVal.getLong("dc");
            long nFreqDecrement = entityVal.getLong("f");
            long nCurrFreq = entityVal.getLong("tf");
            long nCurrDocCount = entityVal.getLong("tdc");

            // (These are by construction the lowest values so this will provide some defence against going -ve)
            if (nDocDecrement > nCurrDocCount) {
                nDocDecrement = nCurrDocCount;
            }
            if (nFreqDecrement > nCurrFreq) {
                nFreqDecrement = nCurrFreq;
            }

            BasicDBObject entityId = (BasicDBObject) entityEl.get("_id");
            ObjectId commId = null;
            Object commObj = entityId.get("comm");
            if (commObj instanceof ObjectId) {
                commId = entityId.getObjectId("comm");
            }
            String index = (String) entityId.get("index");
            if ((null == index) || (null == commId)) {
                continue; // random error
            }

            BasicDBObject updateQuery = new BasicDBObject(EntityFeaturePojo.index_, index);
            updateQuery.put(EntityFeaturePojo.communityId_, commId);
            BasicDBObject entityUpdate1 = new BasicDBObject(EntityFeaturePojo.doccount_, -nDocDecrement);
            entityUpdate1.put(EntityFeaturePojo.totalfreq_, -nFreqDecrement);
            BasicDBObject entityUpdate = new BasicDBObject(DbManager.inc_, entityUpdate1);

            if (_diagnosticMode) {
                if (_logInDiagnosticMode)
                    System.out.println("UPDATE FEATURE DATABASE: " + updateQuery.toString() + "/"
                            + entityUpdate.toString());
            } else {
                DbManager.getFeature().getEntity().update(updateQuery, entityUpdate);
                // (can be a single query because the query is on index, the shard)
            }
            //TESTED

            if ((nDocDecrement < nCurrDocCount) && (nDocDecrement * 10 > nCurrDocCount)) {
                // ie there are some documents left
                // and the doc count has shifted by more than 10%
                BasicDBObject updateQuery2 = new BasicDBObject(EntityPojo.docQuery_index_, index);
                updateQuery2.put(DocumentPojo.communityId_, commId);
                BasicDBObject entityUpdate2_1 = new BasicDBObject(EntityPojo.docUpdate_doccount_,
                        nCurrDocCount - nDocDecrement);
                entityUpdate2_1.put(EntityPojo.docUpdate_totalfrequency_, nCurrFreq - nFreqDecrement);
                BasicDBObject entityUpdate2 = new BasicDBObject(DbManager.set_, entityUpdate2_1);

                if (_diagnosticMode) {
                    if (_logInDiagnosticMode)
                        System.out.println("UPDATE DOC DATABASE: " + updateQuery2.toString() + "/"
                                + entityUpdate2.toString());
                } else {
                    DbManager.getDocument().getMetadata().update(updateQuery2, entityUpdate2, false, true);
                }
            }
        } //TESTED (including when to update logic above)
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Tidy up
    DbManager.getDB("doc_metadata").getCollection(outCollection).drop();
}

From source file:com.ikanow.infinit.e.processing.generic.GenericProcessingController.java

License:Open Source License

public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature,
        boolean bRebuildDocsIndex) {

    try { //create elasticsearch indexes

        if (!ElasticSearchManager.pingIndex(null, null)) {
            throw new RuntimeException("Index is red, disable indexing operations");
        } //TESTED

        PropertiesManager pm = new PropertiesManager();

        if (!pm.getAggregationDisabled()) {

            boolean languageNormalization = pm.getNormalizeEncoding();

            Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
            localSettingsEvent.put("number_of_shards", 10).put("number_of_replicas", 2);
            localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
            if (languageNormalization) {
                localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer",
                        "icu_folding", "standard", "lowercase");
            } else {
                localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard",
                        "lowercase");
            }

            Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
            localSettingsGaz.put("number_of_shards", 10).put("number_of_replicas", 2);
            localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
            if (languageNormalization) {
                localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer",
                        "icu_folding", "standard", "lowercase");
            } else {
                localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard",
                        "lowercase");
            }

            //event feature
            String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
                    AssociationFeaturePojoIndexMap.Mapping.class);
            ElasticSearchManager eventIndex = IndexManager.createIndex(
                    AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping,
                    localSettingsEvent);
            if (null == eventIndex) { // (if has been previously referenced in this process space)
                eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
            }
            eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_);
            if (bDeleteEventFeature) {
                eventIndex.deleteMe();
                eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false,
                        null, eventGazMapping, localSettingsEvent);
            }
            //entity feature
            String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
                    EntityFeaturePojoIndexMap.Mapping.class);
            ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_,
                    null, false, null, gazMapping, localSettingsGaz);
            if (null == entityIndex) { // (if has been previously referenced in this process space)
                entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
            }
            entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_);
            if (bDeleteEntityFeature) {
                entityIndex.deleteMe();
                entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null,
                        gazMapping, localSettingsGaz);
            }
        }

        //DOCS - much more complicated than anything else 

        boolean bPingMainIndexFailed = !ElasticSearchManager
                .pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
        // (ie if main doc index doesn't exist then always rebuild all indexes)

        if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing...
            try {
                Thread.sleep(60000);
            } catch (Exception e) {
            }
            bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
        }
        bRebuildDocsIndex |= bPingMainIndexFailed;

        // check the main index has the "collection" alias - if not then rebuild everything

        if (!bPingMainIndexFailed && (null == _aliasInfo)) {
            ElasticSearchManager docIndex = ElasticSearchManager
                    .getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
            ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster()
                    .state(new ClusterStateRequest()).actionGet();
            _aliasInfo = CrossVersionImmutableMapOfImmutableMaps
                    .getAliases(clusterState.getState().getMetaData());
            if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) {
                bRebuildDocsIndex = true;
            }
        } //TESTED

        createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
        createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);

        // Some hardwired dummy communities
        createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
        createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
        // (create dummy index used to keep personal group aliases)

        if (bRebuildDocsIndex || bDeleteDocs) {

            // OK, going to have different shards for different communities:
            // Get a list of all the communities:

            BasicDBObject query = new BasicDBObject();
            BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
            fieldsToDrop.put("communityAttributes", 0);
            fieldsToDrop.put("userAttributes", 0);
            DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);

            List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
            int i = 0;
            System.out.println("Initializing " + dbc.size() + " indexes:");
            for (int j = 0; j < 2; ++j) {
                for (DBObject dbotmp : tmparray) {
                    if ((++i % 100) == 0) {
                        System.out.println("Initialized " + i + " indexes.");
                    }
                    BasicDBObject dbo = (BasicDBObject) dbotmp;

                    // OK, going to see if there are any sources with this group id, create a new index if so:
                    // (Don't use CommunityPojo data model here for performance reasons....
                    //  (Also, haven't gotten round to porting CommunityPojo field access to using static fields))
                    ObjectId communityId = (ObjectId) dbo.get("_id");
                    boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
                    boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
                    ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");

                    createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup,
                            bSystemGroup, bDeleteDocs, j == 0);

                } //end loop over communities
            } // end loop over communities - first time parents only
        } // (end if need to do big loop over all sources)
    } catch (Exception e) {
        //DEBUG
        //e.printStackTrace();

        throw new RuntimeException(e.getMessage());
    }
}

From source file:com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager.java

License:Open Source License

/**
 * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion)
 * @param col
 * @param doc - needs  url, sourceKey set
 * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance
 * 
 * CALLED FROM: removeFromDatastore_byURL(col, List<doc>, bDeleteContent) <- ADDS INDEX, CREATED TO FIELDS 
 *                removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
 *                   MongoDocumentTxfer.doDelete(...)  <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
 *                   processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey, 
 *                                     DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
 *                   pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX
 *                      updateHarvestStatus(...)
 */
private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields,
        boolean bDeleteContent) {

    // 1] Create the query to soft delete the document

    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, doc.getUrl());
    query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm(doc.getSourceKey()));

    // 2] Delete the content if needed

    if (bDeleteContent) {
        if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            if (!_diagnosticMode) {
                DbManager.getDocument().getContent().remove(query);
            } else {
                System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: "
                        + doc.getSourceKey() + "/" + doc.getUrl());
            }
        }
    }
    //TESTED

    // 3] Work out which fields we have and which (if any we need to go and fetch):

    boolean needToFindAndModify = false;

    if (null == doc.getId()) { // This is called from processDocuments

        if (null != doc.getUpdateId()) { // update case...
            doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good)

            // (doc.index is populated but may not be correct because of the "many geos" workaround):
            if (DocumentPojoIndexMap.hasManyGeos(doc)) {
                doc.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
                // (note this check isn't stateless, it actually populates "locs" at the same time
                //  this is handled in addToDatastore (update case), temp removed when adding to DB
            } //TESTED (2.1.2, diagnostic mode, doc2)
        } else { // Not an update case, we're going to have to grab the document after all, which is a bit slower
            needToFindAndModify = true;
        }
    } //TESTED (2.1.2, diagnostic mode, doc2)
    if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore
        if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc)
            doc.setCreated(new Date(doc.getUpdateId().getTime()));
        } //TESTED (2.1.2, diagnostic mode, doc2)               
    }
    // (if we're here and index is not set, then it is intended to be null)

    // 4] Update the doc_metadata collection

    BasicDBObject softDelete = getSoftDeleteUpdate();
    BasicDBObject deadDoc = null; // (not normally needed)

    if (needToFindAndModify) { // less pleasant, need to go grab the doc
        deadDoc = (BasicDBObject) col.findOne(query, fields);
    } //TESTED (2.1.2)

    if (!_diagnosticMode) {
        col.update(query, softDelete, false, true); // (needs to be multi- even though there's a single element for sharding reasons)         
    } //TESTED (2.1.2)

    // 5] Add fields if necessary

    if (null != deadDoc) {
        doc.setCreated((Date) deadDoc.get(DocumentPojo.created_));
        // (if getting this doc anyway then might as well get the created)
        doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_));
        doc.setIndex((String) deadDoc.get(DocumentPojo.index_));

        if (_diagnosticMode) {
            System.out
                    .println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString());
        }
    } //TESTED (2.1.2)
    else if (_diagnosticMode) {
        if (!needToFindAndModify) {
            System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted "
                    + doc.toDb().toString());
        } else {
            System.out.println(
                    "StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString());
        }
    } //TESTED (2.1.2)
}

From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 

    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex("association_index");
    //elasticManager.deleteMe();

    // Create the index if necessary
    String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
            AssociationFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping,
            localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex("association_index");
    }

    // Now query the DB:

    DBCursor dbc = null;
    dbc = eventFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();

    int nSynced = 0;

    // Loop over array and invoke the cleansing function for each one
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

        // If this table has just been rebuilt from the document then the indexes are all wrong ...
        // recalculate and save
        if ('#' == evt.getIndex().charAt(0)) {
            AssociationPojo singleEvt = new AssociationPojo();
            singleEvt.setEntity1_index(evt.getEntity1_index());
            singleEvt.setEntity2_index(evt.getEntity2_index());
            singleEvt.setVerb_category(evt.getVerb_category());
            singleEvt.setGeo_index(evt.getGeo_index());
            evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt));
            eventFeatureDB
                    .update(new BasicDBObject("_id", dbo.get("_id")),
                            new BasicDBObject(MongoDbManager.set_,
                                    new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                            false, true);
            // (has to be a multi-update even though it's unique because it's sharded on index)
        }

        // Handle groups (system group is: "4c927585d591d31d7b37097a")
        if (null == evt.getCommunityId()) {
            evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
        }
        // Bulk add prep
        events.add(evt);
        nSynced++;

        if (events.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events,
                    AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null,
                    true);
            events.clear();
        }
    }
    // End loop over entities

    // write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
            new AssociationFeaturePojoIndexMap()), "_id", null, true);

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:

    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();

    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)

    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString()
                .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException(
                    "Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
            query.put(DocumentPojo.sourceKey_,
                    SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                sourceKeys.addAll(sourceKeysForSource);
            }
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)         
    } else {
        if (query.toString()
                .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key,
                            distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                    added += sourceKeysForSource.size();
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();

                System.out.println("(Can't optimize complex community query: " + e.getMessage());
            }
        } //TESTED (by hand - including distributed source version)
    }
    // Ignored delete objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
      //DEBUG
      //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {
        EntityBackgroundAggregationManager.startThread();
        AssociationBackgroundAggregationManager.startThread();
    }

    //Debug:
    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    byte[] storageArray = new byte[200000];

    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        }
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            _deletedIndex.add(sDocIndex);
            rebuildIndex(sDocIndex);
            try { // (Just in case the index requires some time to sort itself out)
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
        }

        //Debug:
        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);

            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    if (dbc.hasNext()) {
                        continue;
                    } //TESTED (by hand)               
                }

                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                gzip.close();
                doc.setFullText(output.toString());
            }
        }
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                    .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);

                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                        //DEBUG
                        e.printStackTrace();
                    }
                } //TESTED (by hand)

                _sourceCache.put(doc.getSourceKey(), src);
            }
        }
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }

                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                doc.getRawSourceKey()); // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                        docsDB.update(updateQuery,
                                new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_,
                                        new BasicDBObject(DbManager.each_, tagsTidied))));
                    }
                    doc.setTags(tagsTidied); // (just copy ptr across)
                }
            }
        }

        // 2. Update the index with the new document            

        // (Optionally also update entity and assoc features)

        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                perCommunityDocList.add(doc);
            }
        } //TESTED

        nSynced++;
        docsToTransfer.add(doc);
        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();

            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs               
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;

            } //TOTEST            

            manager.addToSearch(docsToTransfer);
            docsToTransfer.clear();
            System.out.println("(Synced " + nSynced + " records)");
        }

    } // (End loop over docs)

    // Sync remaining docs

    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs               
            doAggregation(communityList, docsToTransfer);
        }

        StoreAndIndexManager manager = new StoreAndIndexManager();
        manager.addToSearch(docsToTransfer);
    }

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }

    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
        System.out.println(
                "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
            Thread.sleep(300000);
        } catch (InterruptedException e) {
        }

        // Turn off so we can exit
        EntityBackgroundAggregationManager.stopThreadAndWait();
        AssociationBackgroundAggregationManager.stopThreadAndWait();
    }
}
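
The $min/$max handling in the transfer method above is a recurring idiom in these utilities: the chunk boundaries are read back out of a BasicDBObject with get and applied to the cursor as query modifiers. Below is a minimal, self-contained sketch of just that idiom, using the same legacy driver API as the listings; the class and method names are illustrative, and the literal "$min"/"$max" keys stand in for DbManager.min_/DbManager.max_.

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;

public class ChunkBoundedQueryExample {
    // Applies optional chunk boundaries, pulled out of the chunk object with
    // BasicDBObject.get, to a cursor - mirroring the pattern in the transfer code above.
    public static DBCursor boundedFind(DBCollection coll, BasicDBObject query, BasicDBObject chunk) {
        DBCursor cursor = coll.find(query);
        if (null != chunk) {
            if (chunk.containsField("$min")) {
                cursor = cursor.addSpecial("$min", chunk.get("$min")); // lower bound (inclusive)
            }
            if (chunk.containsField("$max")) {
                cursor = cursor.addSpecial("$max", chunk.get("$max")); // upper bound (exclusive)
            }
        }
        return cursor.batchSize(1000);
    }
}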

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

@SuppressWarnings("unused")
private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
        BasicDBObject query, int nLimit) {
    ElasticSearchManager elasticManager = null;

    try {
        // Initialize the DB:

        DBCollection feedsDB = DbManager.getDocument().getMetadata();
        DBCollection contentDB = DbManager.getDocument().getContent();
        DBCollection sourcesDB = DbManager.getIngest().getSource();

        String indexName = "document_index";

        // Test/debug recreate the index
        if (true) {

            // (delete the index)
            System.out.println("Deleting index...");
            elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
            elasticManager.deleteMe();
            //(also deletes the child index - same index, different type)

            // Create the index if necessary
            String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(),
                    DocumentPojoIndexMap.Mapping.class);

            Builder localSettings = ImmutableSettings.settingsBuilder();
            localSettings.put("number_of_shards", 10).put("number_of_replicas", 2);

            System.out.println("Creating index..." + sMapping);
            elasticManager = ElasticSearchManager.createIndex(indexName, null, false,
                    sElasticHost + ":" + sElasticPort, sMapping, localSettings);

        }
        // Get the index (necessary if already created)
        if (null == elasticManager) {
            elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
        }

        // Get the feeds from the DB:

        //Debug:
        //         System.out.println("Querying DB...");

        DBCursor dbc = feedsDB.find(query).limit(nLimit);

        byte[] storageArray = new byte[200000];

        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);

            //Debug:
            System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());

            // Get the content:
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
            if (null != dboContent) {
                byte[] compressedData = ((byte[]) dboContent.get("gzip_content"));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                StringBuffer fullText = new StringBuffer();
                int nRead = 0;
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        fullText.append(new String(storageArray, 0, nRead, "UTF-8"));
                    }
                }
                gzip.close();
                doc.setFullText(fullText.toString());
            }
            // Get tag:
            SourcePojo src = _sourceCache.get(doc.getSourceKey());
            if (null == src) {
                BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                        .findOne(new BasicDBObject("key", doc.getSourceKey()));
                if (null != srcDbo) {
                    src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);

                    _sourceCache.put(doc.getSourceKey(), src);
                }
            }
            if ((null != src) && (null != src.getTags())) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }
                doc.setTags(tagsTidied);
            }

            //TEST: set dynamic field
            // Lots of testing of dynamic dates:
            //            feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
            //            String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated());            
            //            feed.addToMetadata("another_dateISO", s1);
            //            String s1_5 = new SimpleDateFormat().format(feed.getCreated());
            //            feed.addToMetadata("another_dateTimeJava", s1_5);
            //            String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated());            
            //            feed.addToMetadata("another_dateYYYYMMDD", s2);
            //            String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated());
            //            feed.addToMetadata("another_dateRFC822", s3);
            //            feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString());
            //            // Testing of the string field types
            //            feed.addToMetadata("my_comment", "Testing this ABCDEFG");            
            //            feed.addToMetadata("my_term", "Testing this UVWXYZ");
            //            feed.addToMetadata("my_text", "Testing this 123456");            
            //            // Test an array of longs:
            //            Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L;
            //            feed.addToMetadata("md_long", tl);

            //TEST: some dummy event timestamp adding code (not seeing much/any in the data)
            //            if (null != feed.getEvents()) {
            //               int i = 0;
            //               for (EventPojo evt: feed.getEvents()) {
            //                  //1: Add single date
            //                  if (0 == i) {
            //                     evt.time_start = "2011-01-01";
            //                  }
            //                  //2: Add short span
            //                  if (1 == i) {
            //                     evt.time_start = "2010-04-06";
            //                     evt.time_end = "2010-08-09";
            //                  }
            //                  //3: Add cross-yr span
            //                  if (2 == i) {
            //                     evt.time_start = "2012-06-05";
            //                     evt.time_end = "2013-09-05";
            //                  }
            //                  //4: Add too long span
            //                  if (3 == i) {
            //                     evt.time_start = "2012-04-06";
            //                     evt.time_end = "2014-04-09";
            //                  }
            //                  i++;
            //               }
            //            }

            // For event adding, see data_model.test.TestCode
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        //nothing to do
    }
}
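
Both doUnitTest and the transfer loop before it pull the compressed full text out of the content record the same way: cast the result of BasicDBObject.get to byte[] and stream it through GZIPInputStream. A minimal sketch of that step in isolation follows, assuming the field name "gzip_content" and UTF-8 text as in the listings above; the class and method names are illustrative.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;

import com.mongodb.BasicDBObject;

public class GzipContentFieldExample {
    // Reads a gzip-compressed binary field (returned by BasicDBObject.get as byte[])
    // and returns the decompressed content as a UTF-8 string, or null if the field is absent.
    public static String decompressField(BasicDBObject dbo, String fieldName) throws IOException {
        byte[] compressed = (byte[]) dbo.get(fieldName);
        if (null == compressed) {
            return null;
        }
        GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed));
        try {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buffer = new byte[65536];
            int nRead;
            while ((nRead = gzip.read(buffer)) > 0) {
                out.write(buffer, 0, nRead);
            }
            return out.toString("UTF-8");
        } finally {
            gzip.close();
        }
    }
}

Accumulating the raw bytes before decoding avoids splitting a multi-byte UTF-8 character across read boundaries, which a chunk-by-chunk String conversion can do.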

From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 

    String indexName = "entity_index";
    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex(indexName);
    //elasticManager.deleteMe();

    // Create the index if necessary
    String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
            EntityFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex(indexName);
    }

    // Now query the DB:

    DBCursor dbc = null;
    dbc = entityFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    int nSynced = 0;

    List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
    while (dbc.hasNext()) {
        EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

        if (null != feature.getAlias()) { // (some corrupt gazetteer entry)

            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            // if there is no community id, add system group (something is wrong if this happens?)
            if (null == feature.getCommunityId()) {
                feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
            }
        }

        entities.add(feature);
        nSynced++;

        // Add the entities
        if (entities.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                    new EntityFeaturePojoIndexMap()), "_id", null, true);
            // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

            entities = new ArrayList<EntityFeaturePojo>();
        }
    }
    // Write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
            new EntityFeaturePojoIndexMap()), "_id", null, true);
    // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}
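
The tail of doTransfer above shows the usual batching shape: accumulate records off the cursor, flush every 1000, then flush whatever is left after the loop. A generic sketch of that shape follows; the BatchSink interface and the class/method names are hypothetical stand-ins for the bulkAddDocuments call in the listing.

import java.util.ArrayList;
import java.util.List;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;

public class BatchedTransferExample {
    // Hypothetical callback standing in for the bulk-index call above.
    public interface BatchSink {
        void flush(List<BasicDBObject> batch);
    }

    // Drains a cursor in fixed-size batches, flushing the remainder at the end,
    // mirroring the 1000-record batching in doTransfer above.
    public static int drainInBatches(DBCursor cursor, int batchSize, BatchSink sink) {
        List<BasicDBObject> batch = new ArrayList<BasicDBObject>();
        int total = 0;
        while (cursor.hasNext()) {
            batch.add((BasicDBObject) cursor.next());
            total++;
            if (batch.size() >= batchSize) {
                sink.flush(batch);
                batch = new ArrayList<BasicDBObject>();
            }
        }
        if (!batch.isEmpty()) { // write whatever's left
            sink.flush(batch);
        }
        return total;
    }
}

A caller would implement BatchSink by wrapping something like the elasticManager.bulkAddDocuments(...) call from the listing, so the cursor-draining logic stays independent of the indexing layer.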