From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java

License:Open Source License

/**
 * Writes a Hadoop job configuration XML document for a custom map/reduce job to {@code out}.
 *
 * <p>The {@code query} string is parsed as JSON when it starts with "{" (otherwise an empty query
 * object is used). Several "$"-prefixed control fields are read from it: {@code $limit},
 * {@code $splits}, {@code $srctags}, {@code $docsPerSplit}, {@code $mapper_key_class},
 * {@code $mapper_value_class} and {@code $caches}; {@code $tmin} and {@code $tmax} are removed
 * from the query and converted to dates. The (modified) query object is then serialized back
 * into the {@code mongo.input.query} property.
 *
 * <p>NOTE(review): this listing appears to be truncated - several closing braces and the starts of
 * some statements are not visible - so the comments below describe only what the visible lines show.
 *
 * @param out                      destination for the generated XML
 * @param title                    written as {@code mongo.job.name} (not XML-escaped here)
 * @param input                    input collection; compared against "feature.temporal" and
 *                                 "doc_metadata.metadata", and replaced via
 *                                 {@code CustomOutputManager.getCustomDbAndCollection} in the custom-table branch
 * @param fields                   written as {@code mongo.input.fields}; {@code null} becomes an empty value
 * @param isCustomTable            selects the custom-table branch of the query handling
 * @param outputDatabase           database part of the output URI
 * @param output                   ignored on entry: immediately overwritten with
 *                                 {@code outputDatabase + "." + tempOutputCollection}
 * @param tempOutputCollection     collection part of the output URI
 * @param mapper                   written as {@code mongo.job.mapper}
 * @param reducer                  written as {@code mongo.job.reducer}
 * @param combiner                 written as {@code mongo.job.combiner}
 * @param query                    JSON query/control string (see above); must not be {@code null}
 *                                 since {@code startsWith} is called on it
 * @param communityIds             added to the query as a community filter in the non-custom-table branches
 * @param outputKey                written as {@code mongo.job.output.key}
 * @param outputValue              written as {@code mongo.job.output.value}
 * @param arguments                written (XML-escaped) as {@code arguments}; {@code null} becomes ""
 * @param incrementalMode          written as {@code update.incremental}; {@code null} is treated as false
 * @param userId                   written as {@code infinit.e.userid}; also used for the admin check
 * @param selfMerge                when true (and {@code originalOutputCollection} is non-null) a
 *                                 mongodb:// URI for the original output collection is built
 * @param originalOutputCollection see {@code selfMerge}
 * @param appendResults            not referenced in the visible lines
 * @throws IOException propagated from writes to {@code out}
 */
private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable,
        String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer,
        String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue,
        String arguments, Boolean incrementalMode, ObjectId userId, Boolean selfMerge,
        String originalOutputCollection, Boolean appendResults) throws IOException {
    String dbserver = prop_general.getDatabaseServer();
    // (the incoming "output" argument is discarded - the job always writes to the temp collection)
    output = outputDatabase + "." + tempOutputCollection;

    boolean isAdmin = AuthUtils.isAdmin(userId);

    // Defaults, overridable via "$splits" / "$docsPerSplit" in the query
    int nSplits = 8;
    int nDocsPerSplit = 12500;

    //add communities to query if this is not a custom table
    BasicDBObject oldQueryObj = null;
    BasicDBObject srcTags = null;
    // Start with the old query:
    if (query.startsWith("{")) {
        oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query);
    } else {
        oldQueryObj = new BasicDBObject();
    // A "qt" field on a non-custom-table input marks the query as an elasticsearch query
    boolean elasticsearchQuery = oldQueryObj.containsField("qt") && !isCustomTable;
    int nLimit = 0;
    if (oldQueryObj.containsField("$limit")) {
        nLimit = oldQueryObj.getInt("$limit");
    if (oldQueryObj.containsField("$splits")) {
        nSplits = oldQueryObj.getInt("$splits");
    if (oldQueryObj.containsField("$srctags")) {
        srcTags = new BasicDBObject(SourcePojo.tags_, oldQueryObj.get("$srctags"));
    if (bLocalMode) { // If in local mode, then set this to a large number so we always run inside our limit/split version
        // (since for some reason MongoInputFormat seems to fail on large collections)
        nSplits = InfiniteMongoSplitter.MAX_SPLITS;
    if (oldQueryObj.containsField("$docsPerSplit")) {
        nDocsPerSplit = oldQueryObj.getInt("$docsPerSplit");
    String mapperKeyClass = oldQueryObj.getString("$mapper_key_class", "");
    String mapperValueClass = oldQueryObj.getString("$mapper_value_class", "");
    String cacheList = null;
    Object cacheObj = oldQueryObj.get("$caches");
    if (null != cacheObj) {
        cacheList = cacheObj.toString(); // (either array of strings, or single string)
        if (!cacheList.startsWith("[")) {
            cacheList = "[" + cacheList + "]"; // ("must" now be valid array)
    } //TESTED

    if (null != nDebugLimit) { // (debug mode override)
        nLimit = nDebugLimit;
    // Null-safe unboxing: a null incrementalMode counts as "not incremental"
    boolean tmpIncMode = (null != incrementalMode) && incrementalMode;

    // Optional time-range overrides; note these are removed from the query object
    Date fromOverride = null;
    Date toOverride = null;
    Object fromOverrideObj = oldQueryObj.remove("$tmin");
    Object toOverrideObj = oldQueryObj.remove("$tmax");
    if (null != fromOverrideObj) {
        fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true);
    if (null != toOverrideObj) {
        toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false);

    if (!isCustomTable) {
        if (elasticsearchQuery) {
            oldQueryObj.put("communityIds", communityIds);
            //tmin/tmax not supported - already have that capability as part of the query
        } else {
            if (input.equals("feature.temporal")) {
                if ((null != fromOverride) || (null != toOverride)) {
                    // NOTE(review): the start of this statement is missing from the listing
                            InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, true));
                } //TESTED
                oldQueryObj.put("_id.c", new BasicDBObject(DbManager.in_, communityIds));
            } else {
                oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
                if ((null != fromOverride) || (null != toOverride)) {
                    // NOTE(review): the start of this statement is missing from the listing
                            InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false));
                } //TESTED
                if (input.equals("doc_metadata.metadata")) {
                    oldQueryObj.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (ensures not soft-deleted)
    } else {
        // Custom table: the date range is applied to "_id"
        if ((null != fromOverride) || (null != toOverride)) {
            oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false));
        } //TESTED
          //get the custom table (and database)
        input = CustomOutputManager.getCustomDbAndCollection(input);
    // Re-serialize the query, now including the community/date filters added above
    query = oldQueryObj.toString();

    if (arguments == null)
        arguments = "";

    // Generic configuration
    out.write("<?xml version=\"1.0\"?>\n<configuration>");

    // Mongo specific configuration
    // NOTE(review): only the query is XML-escaped in this write; title, fields, mapper, reducer, combiner,
    // outputKey, outputValue and the mapper key/value classes are concatenated as-is - confirm callers
    // never pass values containing XML metacharacters
    out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title
            + "</value></property>"
            + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>"
            + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>"
            + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://"
            + dbserver + "/" + input + "</value></property>"
            + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://"
            + dbserver + "/" + output + "</value>  </property>"
            + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>"
            + StringEscapeUtils.escapeXml(query) + "</value></property>"
            + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>"
            + ((fields == null) ? ("") : fields) + "</value></property>"
            + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>"
            + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>"
            + nLimit + "</value><!-- 0 == no limit --></property>"
            + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? --><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>"
            + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper
            + "</value></property>"
            + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer
            + "</value></property>"
            + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>"
            + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat</value></property>"
            + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>"
            + outputKey + "</value></property>"
            + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>"
            + outputValue + "</value></property>"
            + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value>"
            + mapperKeyClass + "</value></property>"
            + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value>"
            + mapperValueClass + "</value></property>"
            + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>"
            + combiner + "</value></property>"
            + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>"
            + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>"
            + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>");

    // Infinit.e specific configuration
    // NOTE(review): "infinit.e.userid" is emitted twice in this write (first and last property), and the
    // first one carries the "User Arguments" comment - looks like a copy/paste leftover; confirm before removing.
    // userId.toString() will throw if userId is null.

    out.write("\n\t<property><!-- User Arguments [optional] --><name>infinit.e.userid</name><value>"
            + StringEscapeUtils.escapeXml(userId.toString()) + "</value></property>"
            + "\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>"
            + StringEscapeUtils.escapeXml(arguments) + "</value></property>"
            + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>"
            + nSplits + "</value></property>"
            + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>"
            + nDocsPerSplit + "</value></property>"
            + "\n\t<property><!-- Infinit.e incremental mode [optional] --><name>update.incremental</name><value>"
            + tmpIncMode + "</value></property>"
            + "\n\t<property><!-- Infinit.e quick admin check [optional] --><name>infinit.e.is.admin</name><value>"
            + isAdmin + "</value></property>"
            + "\n\t<property><!-- Infinit.e userid [optional] --><name>infinit.e.userid</name><value>" + userId
            + "</value></property>");
    // Optional properties: in each of the three branches below the start of the statement
    // (presumably an out.write call - confirm against the full file) is missing from this listing
    if (null != cacheList) {
                "\n\t<property><!-- Infinit.e cache list [optional] --><name>infinit.e.cache.list</name><value>"
                        + cacheList + "</value></property>");
    } //TESTED
    if (null != srcTags) {
                "\n\t<property><!-- Infinit.e src tags filter [optional] --><name>infinit.e.source.tags.filter</name><value>"
                        + srcTags.toString() + "</value></property>");

    if (null != selfMerge && selfMerge && originalOutputCollection != null) {
        // Expand the bare collection name into a full mongodb:// URI
        originalOutputCollection = "mongodb://" + dbserver + "/" + outputDatabase + "."
                + originalOutputCollection;
                "\n\t<property><!-- This jobs output collection for passing into the mapper along with input collection [optional] --><name>infinit.e.selfMerge</name><value>"
                        + originalOutputCollection + "</value></property>");

    // Closing thoughts:


From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java

License:Open Source License

/**
 * Iterates the sequence-file records under a job's output path and converts each key/value pair
 * into a {@code BasicDBObject}.
 *
 * <p>Keys of type Text/DoubleWritable/IntWritable/LongWritable are stored as strings under "key";
 * {@code BSONWritable} keys are converted via {@code MongoDbUtil.convert}. Values follow the same
 * scheme under "value", with additional handling for Mahout {@code VectorWritable},
 * {@code WeightedVectorWritable} (which also sets "valueWeight") and {@code ClusterWritable}
 * (stored as an object with "center" and "radius"). Any other value type is reported by class
 * name under "unknownValue".
 *
 * <p>NOTE(review): this listing appears to be truncated (closing braces and some statements are
 * missing); in particular the statement that adds each element to the returned list, the
 * increment of the record counter and the action taken once the limit is reached are not visible.
 *
 * @param cmr    job whose output path is resolved via {@code HadoopUtils.getPathForJob}
 * @param nLimit maximum number of records; only enforced when greater than 0
 * @param fields optional comma-separated field specification, each entry further split on ":"
 *               (how the pieces are used is not visible here); {@code null} disables field handling
 * @return the list declared at the top of the method (presumably populated with one element per record)
 * @throws SAXException                 declared; no visible statement raises it directly - presumably
 *                                      from configuration loading, confirm
 * @throws IOException                  declared; see above
 * @throws ParserConfigurationException declared; see above
 */
public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields)
        throws SAXException, IOException, ParserConfigurationException {

    BasicDBList dbl = new BasicDBList();

    PropertiesManager props = new PropertiesManager();
    Configuration conf = getConfiguration(props);

    Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);

    // Raw constructor call, hence the suppressed warnings; key/value types are only known at runtime
    @SuppressWarnings({ "unchecked", "rawtypes" })
    SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable(
            pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);

    // Very basic, only allow top level, 1 level of nesting, and field removal
    HashSet<String> fieldLookup = null;
    if (null != fields) {
        fieldLookup = new HashSet<String>();
        String[] fieldArray = fields.split(",");
        for (String field : fieldArray) {
            // NOTE(review): fieldDecomp is computed but nothing visible here uses it or populates fieldLookup
            String[] fieldDecomp = field.split(":");
        }
    } //TOTEST

    int nRecords = 0;
    for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) {
        BasicDBObject element = new BasicDBObject();

        // KEY
        // (primitive writables are stringified; only BSON keys keep their structure)

        Writable key = record.getFirst();
        if (key instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key;
            element.put("key", writable.toString());
        } else if (key instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key;
            element.put("key", Double.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key;
            element.put("key", Integer.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key;
            element.put("key", Long.toString(writable.get()));
        } else if (key instanceof BSONWritable) {
            element.put("key", MongoDbUtil.convert((BSONWritable) key));

        // VALUE
        // (same scheme as the key, plus the Mahout vector/cluster types)

        Writable value = record.getSecond();
        if (value instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value;
            element.put("value", writable.toString());
        } else if (value instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value;
            element.put("value", Double.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value;
            element.put("value", Integer.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value;
            element.put("value", Long.toString(writable.get()));
        } else if (value instanceof BSONWritable) {
            element.put("value", MongoDbUtil.convert((BSONWritable) value));
        } else if (value instanceof org.apache.mahout.math.VectorWritable) {
            Vector vec = ((org.apache.mahout.math.VectorWritable) value).get();
            BasicDBList dbl2 = listFromMahoutVector(vec, "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) {
            org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value;
            element.put("valueWeight", vecW.getWeight());
            BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) {
            Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue();
            BasicDBObject clusterVal = new BasicDBObject();
            clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal));
            clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal));
            element.put("value", clusterVal);
        } else {
            // Unsupported type: record its class name rather than failing
            element.put("unknownValue", value.getClass().toString());

        // Check the fields settings:
        // Only handle a few...
        if (null != fieldLookup) {
            for (String fieldToRemove : fieldLookup) {
                if (fieldToRemove.startsWith("value.")) {
                    // Strip the "value." prefix (6 characters) to get the nested field name
                    fieldToRemove = fieldToRemove.substring(6);
                    // NOTE(review): the lookup key is "value." (with a trailing dot) but values are stored
                    // under "value" above, so this looks like it always returns null - confirm/fix
                    BasicDBObject nested = (BasicDBObject) element.get("value.");
                    if (null != nested) {
                } else {
            } //TOTEST

        // A non-positive nLimit means "no limit"
        if ((nLimit > 0) && (nRecords >= nLimit)) {

    return dbl;

From source file:com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchHadoopUtils.java

License:Apache License

/**
 * Populates {@code config} with the settings for an Elasticsearch-backed job input.
 *
 * <p>Visible effects:
 * <ul>
 * <li>removes the control fields {@code $types}, {@code $tmin}, {@code $tmax} and {@code $streaming}
 *     from {@code advancedConfigurationDbo} (the caller's object is mutated);</li>
 * <li>sets {@code es.query} either to the "query" string as-is (URL form) or to a JSON object built
 *     from the "query", "filter", "fields" and "size" entries; with no "query" entry it is left unset;</li>
 * <li>sets {@code es.resource} to the index/type names accumulated over {@code job.communityIds},
 *     and {@code es.index.read.missing.as.empty} to "yes";</li>
 * <li>when {@code InfiniteEsInputFormat.LOCAL_DEBUG_MODE} is set, routes HTTP through a proxy at
 *     localhost:8888.</li>
 * </ul>
 *
 * <p>NOTE(review): this listing appears to be truncated (many closing braces and the statements that
 * build the index/type name buffers are missing), so how the names are assembled cannot be
 * documented from here.
 *
 * @param job                      supplies the community ids to derive indexes for
 * @param config                   Hadoop configuration to write the "es.*" settings into
 * @param advancedConfigurationDbo query/control object; mutated as described above
 * @throws RuntimeException if {@code $tmin}/{@code $tmax} is combined with a URL-style (String) query
 *                          or with an explicit "filter", or if no index names were accumulated for
 *                          any community
 */
public static void handleElasticsearchInput(CustomMapReduceJobPojo job, Configuration config,
        BasicDBObject advancedConfigurationDbo) {
    // Pull out type list:
    Object o = advancedConfigurationDbo.remove("$types");
    String[] types = null;
    if (null != o) {
        if (o instanceof BasicDBList) {
            // NOTE(review): presumably all elements are Strings; a non-String element would make
            // toArray(String[]) fail at runtime - confirm
            types = ((BasicDBList) o).toArray(new String[0]);
        } else if (o instanceof String) {
            // Comma-separated list, tolerant of whitespace around the commas
            types = ((String) o).split("\\s*,\\s*");
    } //TESTED (by hand)


    // Date override:
    Date fromOverride = null;
    Date toOverride = null;
    Object fromOverrideObj = advancedConfigurationDbo.remove("$tmin");
    Object toOverrideObj = advancedConfigurationDbo.remove("$tmax");
    if (null != fromOverrideObj) {
        fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true);
    if (null != toOverrideObj) {
        toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false);
    // Tri-state: null means "$streaming" was absent (or not a Boolean)
    Boolean streaming = null;
    Object streamingObj = advancedConfigurationDbo.remove("$streaming");
    if (streamingObj instanceof Boolean) {
        streaming = (Boolean) streamingObj;

    //System.out.println("QUERY = " + advancedConfigurationDbo.toString());

    BasicDBObject newQuery = new BasicDBObject();
    Object queryObj = advancedConfigurationDbo.get("query");
    if (queryObj instanceof String) {
        config.set("es.query", queryObj.toString()); // (URL version)
        if ((null != fromOverride) || (null != toOverride)) {
            throw new RuntimeException(
                    "Can't specify $tmin/$tmax shortcut in conjunction with 'URL' query type");
        } //TESTED
    } else if (null != queryObj) {
        // JSON version: copy across only the recognized top-level entries
        newQuery.put("query", queryObj);
        Object filterObj = advancedConfigurationDbo.get("filter");
        if (null != filterObj)
            newQuery.put("filter", filterObj); // (doesn't matter if it doesn't exist)
        Object fieldsObj = advancedConfigurationDbo.get("fields");
        if (null != fieldsObj)
            newQuery.put("fields", fieldsObj); // (doesn't matter if it doesn't exist)
        Object sizeObj = advancedConfigurationDbo.get("size");
        if (null != sizeObj)
            newQuery.put("size", sizeObj); // (doesn't matter if it doesn't exist)

        if ((null != fromOverride) || (null != toOverride)) {
            if (null == filterObj) {
                // Translate $tmin/$tmax into a range filter on "@timestamp" (epoch milliseconds)
                BasicDBObject filterRangeParamsDbo = new BasicDBObject();
                if (null != fromOverride) {
                    filterRangeParamsDbo.put("gte", fromOverride.getTime());
                if (null != toOverride) {
                    filterRangeParamsDbo.put("lte", toOverride.getTime());
                BasicDBObject filterRangeDbo = new BasicDBObject("@timestamp", filterRangeParamsDbo);
                BasicDBObject filterDbo = new BasicDBObject("range", filterRangeDbo);
                newQuery.put("filter", filterDbo);
            } else { // combine filter
                throw new RuntimeException(
                        "Can't (currently) specify $tmin/$tmax shortcut in conjunction with filter");
            } //TESTED

        config.set("es.query", newQuery.toString());
    //(else no query == match all)


    // Only needed when date filtering: used to pull a "yyyy.MM.dd" date out of index names
    Pattern dateRegex = null;
    ThreadSafeSimpleDateFormat tssdf = null;
    if ((null != fromOverride) || (null != toOverride)) {
        dateRegex = Pattern.compile("[0-9]{4}[.][0-9]{2}[.][0-9]{2}");
        tssdf = new ThreadSafeSimpleDateFormat("yyyy.MM.dd");
    } //TESTED

    StringBuffer overallIndexNames = new StringBuffer();
    for (ObjectId commId : job.communityIds) {
        StringBuffer indexNames = new StringBuffer();
        //TODO (INF-2641): need to handle:
        //c) anyway to sub-query?! (look for communityIds term?!)

        // NOTE(review): the bodies of these three branches are missing from the listing
        if (null == streaming) {
        } else if (streaming) {
        } else {// !streaming
        } //TESTED

        StringBuffer decomposedIndexes = new StringBuffer();
        boolean needDecomposedIndexes = false;

        HashSet<String> typesAdded = new HashSet<String>();
        if ((null != types) && (null == fromOverride) && (null == toOverride)) { // (types manual, no date filtering - can be much simpler)
            for (String s : types)
        } else {
            // (All this oddly written code is to minimize the number of es types that get exposed, because
            //  they are really badly behaved in terms of bw compatbility)

            if (null != types) {
                for (String s : types)

            ElasticSearchManager indexMgr = ElasticSearchManager.getIndex("doc_dummy"); // (index guaranteed to exist)
            Object[] indexMetaObj = indexMgr.getRawClient().admin().cluster().prepareState()

            if (null != indexMetaObj)
                for (Object oo : indexMetaObj) {
                    IndexMetaData indexMeta = (IndexMetaData) oo;
                    String indexName = indexMeta.getIndex();

                    if ((null != fromOverride) || (null != toOverride)) {
                        //System.out.println("INDEX: " + indexName);

                        Matcher m = dateRegex.matcher(indexName);
                        if (m.find()) {
                            try {
                                Date d = tssdf.parse(m.group());
                                // Last millisecond of the day named in the index (start of day + 24h - 1ms)
                                long endpoint = d.getTime() + 24L * 3600L * 1000L - 1;
                                //System.out.println("***************** COMPARE: " + d + " FROM " + fromOverride + " TO " + toOverride + "..errr . " + m.group());

                                if (null != fromOverride) {
                                    if (endpoint < fromOverride.getTime()) { // no overlap on the left
                                        needDecomposedIndexes = true;
                                } //TESTED
                                if (null != toOverride) {
                                    if (d.getTime() > toOverride.getTime()) { // no overlap on the right
                                        needDecomposedIndexes = true;
                                } //TESTED

                            } catch (ParseException e) {
                                // just carry on, odd index name, it happens
                                needDecomposedIndexes = true;
                    } //TESTED (end loop over time checking)

                    if (null == types) {
                        // No manual type list: derive types from the index mappings, skipping "_default_"
                        Iterator<String> typesIt = indexMeta.getMappings().keysIt();
                        while (typesIt.hasNext()) {
                            String type = typesIt.next();
                            if (!type.equals("_default_")) {
                    if (0 != decomposedIndexes.length()) {

                } //(end loop over indexes)
        } //(end if need to derive the types from the indexes)

        if (needDecomposedIndexes) { // (because we filtered some indexes out)
            indexNames = decomposedIndexes;
        if (0 == indexNames.length()) {
            continue; // nothing to do here...

        int numTypesAdded = 0;
        if (typesAdded.isEmpty()) { // there doesn't seem to be any types associated with this set of indexes
            continue; // (ie don't add)
        } else
            for (String type : typesAdded) {
                if (numTypesAdded > 0) {
                } else {

        if (overallIndexNames.length() > 0) {

    } //(end loop over community)
      //TESTED (by hand)

    if (0 == overallIndexNames.length()) {
        throw new RuntimeException(
                "Communities contained no types, either all indexes empty, or index is corrupt");
    } //TESTED (by hand)

    //System.out.println("INDEXES = " + overallIndexNames.toString());

    config.set("es.resource", overallIndexNames.toString());
    config.set("es.index.read.missing.as.empty", "yes");

    //proxy if running in debug mode:
    if (InfiniteEsInputFormat.LOCAL_DEBUG_MODE) {
        config.set("es.net.proxy.http.host", "localhost");
        config.set("es.net.proxy.http.port", "8888");
    } //TESTED (by hand)


From source file:com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager.java

License:Open Source License

/**
 * Applies entity count decrements read from the {@code <uuid>_AggregationUtils} collection of the
 * "doc_metadata" database.
 *
 * <p>For every record in that collection the method reads the decrements ("dc", "f") and current
 * totals ("tdc", "tf") from its "value" object, clamps the decrements to the current totals, and
 * issues a {@code $inc}-style update (via {@code DbManager.inc_}) against the entity feature
 * collection keyed on index + community id. When some documents remain and the document count
 * dropped by more than 10%, it additionally sets the new totals on matching documents in the
 * document metadata collection. Records whose "_id" has no "index" or no ObjectId "comm" are skipped.
 *
 * <p>In diagnostic mode no database writes are made; the updates are printed instead when
 * diagnostic logging is enabled.
 *
 * <p>NOTE(review): this listing appears to be truncated (closing braces, the body of the
 * aggregation-disabled check, the exception handler body and the code after "Tidy up" are missing).
 *
 * @param uuid prefix of the temporary collection name ({@code uuid + "_AggregationUtils"})
 */
public static void updateDocEntitiesFromDeletedDocuments(String uuid) {
    String outCollection = new StringBuilder(uuid).append("_AggregationUtils").toString();
    try {
        PropertiesManager props = new PropertiesManager();
        if (props.getAggregationDisabled()) { // (no need to do this)
        DBCollection outColl = DbManager.getDB("doc_metadata").getCollection(outCollection);

        DBCursor dbc = outColl.find();
        for (DBObject dbo : dbc) {
            BasicDBObject entityEl = (BasicDBObject) dbo;
            BasicDBObject entityVal = (BasicDBObject) entityEl.get("value");

            // "dc"/"f": amounts to subtract; "tdc"/"tf": current totals for doc count / frequency
            long nDocDecrement = entityVal.getLong("dc");
            long nFreqDecrement = entityVal.getLong("f");
            long nCurrFreq = entityVal.getLong("tf");
            long nCurrDocCount = entityVal.getLong("tdc");

            // (These are by construction the lowest values so this will provide some defence against going -ve)
            if (nDocDecrement > nCurrDocCount) {
                nDocDecrement = nCurrDocCount;
            if (nFreqDecrement > nCurrFreq) {
                nFreqDecrement = nCurrFreq;

            BasicDBObject entityId = (BasicDBObject) entityEl.get("_id");
            ObjectId commId = null;
            Object commObj = entityId.get("comm");
            // Only accept a genuine ObjectId; anything else leaves commId null and the record is skipped
            if (commObj instanceof ObjectId) {
                commId = entityId.getObjectId("comm");
            String index = (String) entityId.get("index");
            if ((null == index) || (null == commId)) {
                continue; // random error

            // Feature-table update: decrement doc count and total frequency by the clamped amounts
            BasicDBObject updateQuery = new BasicDBObject(EntityFeaturePojo.index_, index);
            updateQuery.put(EntityFeaturePojo.communityId_, commId);
            BasicDBObject entityUpdate1 = new BasicDBObject(EntityFeaturePojo.doccount_, -nDocDecrement);
            entityUpdate1.put(EntityFeaturePojo.totalfreq_, -nFreqDecrement);
            BasicDBObject entityUpdate = new BasicDBObject(DbManager.inc_, entityUpdate1);

            if (_diagnosticMode) {
                if (_logInDiagnosticMode)
                    System.out.println("UPDATE FEATURE DATABASE: " + updateQuery.toString() + "/"
                            + entityUpdate.toString());
            } else {
                DbManager.getFeature().getEntity().update(updateQuery, entityUpdate);
                // (can be a single query because the query is on index, the shard)

            // Multiplying by 10 keeps the ">10%" comparison in integer arithmetic
            if ((nDocDecrement < nCurrDocCount) && (nDocDecrement * 10 > nCurrDocCount)) {
                // ie there are some documents left
                // and the doc count has shifted by more than 10%
                BasicDBObject updateQuery2 = new BasicDBObject(EntityPojo.docQuery_index_, index);
                updateQuery2.put(DocumentPojo.communityId_, commId);
                BasicDBObject entityUpdate2_1 = new BasicDBObject(EntityPojo.docUpdate_doccount_,
                        nCurrDocCount - nDocDecrement);
                entityUpdate2_1.put(EntityPojo.docUpdate_totalfrequency_, nCurrFreq - nFreqDecrement);
                BasicDBObject entityUpdate2 = new BasicDBObject(DbManager.set_, entityUpdate2_1);

                if (_diagnosticMode) {
                    if (_logInDiagnosticMode)
                        System.out.println("UPDATE DOC DATABASE: " + updateQuery2.toString() + "/"
                                + entityUpdate2.toString());
                } else {
                    // (upsert=false, multi=true: touch every matching document, never create one)
                    DbManager.getDocument().getMetadata().update(updateQuery2, entityUpdate2, false, true);
        } //TESTED (including when to update logic above)
    } catch (Exception e) {

    // Tidy up

From source file:com.ikanow.infinit.e.processing.generic.GenericProcessingController.java

License:Open Source License

public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature,
        boolean bRebuildDocsIndex) {

    try { //create elasticsearch indexes

        if (!ElasticSearchManager.pingIndex(null, null)) {
            throw new RuntimeException("Index is red, disable indexing operations");
        } //TESTED

        PropertiesManager pm = new PropertiesManager();

        if (!pm.getAggregationDisabled()) {

            boolean languageNormalization = pm.getNormalizeEncoding();

            Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
            localSettingsEvent.put("number_of_shards", 10).put("number_of_replicas", 2);
            localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
            if (languageNormalization) {
                localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer",
                        "icu_folding", "standard", "lowercase");
            } else {
                localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard",
            }/*from  w  ww.  jav a2s  .co m*/

            Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
            localSettingsGaz.put("number_of_shards", 10).put("number_of_replicas", 2);
            localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
            if (languageNormalization) {
                localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer",
                        "icu_folding", "standard", "lowercase");
            } else {
                localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard",

            //event feature
            String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
            ElasticSearchManager eventIndex = IndexManager.createIndex(
                    AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping,
            if (null == eventIndex) { // (if has been previously referenced in this process space)
                eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
            if (bDeleteEventFeature) {
                eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false,
                        null, eventGazMapping, localSettingsEvent);
            //entity feature
            String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
            ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_,
                    null, false, null, gazMapping, localSettingsGaz);
            if (null == entityIndex) { // (if has been previously referenced in this process space)
                entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
            if (bDeleteEntityFeature) {
                entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null,
                        gazMapping, localSettingsGaz);

        //DOCS - much more complicated than anything else 

        boolean bPingMainIndexFailed = !ElasticSearchManager
        // (ie if main doc index doesn't exist then always rebuild all indexes)

        if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing...
            try {
            } catch (Exception e) {
            bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
        bRebuildDocsIndex |= bPingMainIndexFailed;

        // check the main index has the "collection" alias - if not then rebuild everything

        if (!bPingMainIndexFailed && (null == _aliasInfo)) {
            ElasticSearchManager docIndex = ElasticSearchManager
            ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster()
                    .state(new ClusterStateRequest()).actionGet();
            _aliasInfo = CrossVersionImmutableMapOfImmutableMaps
            if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) {
                bRebuildDocsIndex = true;
        } //TESTED

        createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
        createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);

        // Some hardwired dummy communities
        createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
        createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
        // (create dummy index used to keep personal group aliases)

        if (bRebuildDocsIndex || bDeleteDocs) {

            // OK, going to have different shards for different communities:
            // Get a list of all the communities:

            BasicDBObject query = new BasicDBObject();
            BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
            fieldsToDrop.put("communityAttributes", 0);
            fieldsToDrop.put("userAttributes", 0);
            DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);

            List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
            int i = 0;
            System.out.println("Initializing " + dbc.size() + " indexes:");
            for (int j = 0; j < 2; ++j) {
                for (DBObject dbotmp : tmparray) {
                    if ((++i % 100) == 0) {
                        System.out.println("Initialized " + i + " indexes.");
                    BasicDBObject dbo = (BasicDBObject) dbotmp;

                    // OK, going to see if there are any sources with this group id, create a new index if so:
                    // (Don't use CommunityPojo data model here for performance reasons....
                    //  (Also, haven't gotten round to porting CommunityPojo field access to using static fields))
                    ObjectId communityId = (ObjectId) dbo.get("_id");
                    boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
                    boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
                    ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");

                    createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup,
                            bSystemGroup, bDeleteDocs, j == 0);

                } //end loop over communities
            } // end loop over communities - first time parents only
        } // (end if need to do big loop over all sources)
    } catch (Exception e) {

        throw new RuntimeException(e.getMessage());

From source file:com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager.java

License:Open Source License

 * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion)
 * @param col - the document metadata collection the record is looked up and soft-deleted in
 * @param doc - needs  url, sourceKey set
 * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance
 * CALLED FROM: removeFromDatastore_byURL(col, List<doc>, bDeleteContent) <- ADDS INDEX, CREATED TO FIELDS 
 *                removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
 *                   MongoDocumentTxfer.doDelete(...)  <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
 *                   processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey, 
 *                                     DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
 *                   pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX
 *                      updateHarvestStatus(...)
// Soft-deletes the metadata record(s) in "col" matching doc's URL and (distributed) source key, and
// fills in doc's _id / created / index so the doc is ready for the subsequent index deletion.
// "fields" is the projection used if the stored record has to be fetched; "bDeleteContent" additionally
// triggers the external-content branch in step 2.
// NOTE(review): this listing appears to have lost some lines (closing braces, a few statement openers);
// the comments added below describe only the code that is visible here.
private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields,
        boolean bDeleteContent) {

    // 1] Create the query to soft delete the document

    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, doc.getUrl());
    // (the source key term comes from getDistributedKeyQueryTerm rather than being matched literally)
    query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm(doc.getSourceKey()));

    // 2] Delete the content if needed

    if (bDeleteContent) {
        if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            // NOTE(review): nothing is visible in the live (non-diagnostic) branch - presumably the
            // content removal call belongs here; confirm against the full file
            if (!_diagnosticMode) {
            } else {
                System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: "
                        + doc.getSourceKey() + "/" + doc.getUrl());

    // 3] Work out which fields we have and which (if any we need to go and fetch):

    // (set when _id is unknown and there is no updateId to stand in for it, see below)
    boolean needToFindAndModify = false;

    if (null == doc.getId()) { // This is called from processDocuments

        if (null != doc.getUpdateId()) { // update case...
            doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good)

            // (doc.index is populated but may not be correct because of the "many geos" workaround):
            // NOTE(review): the body of this check is not visible in this listing
            if (DocumentPojoIndexMap.hasManyGeos(doc)) {
                // (note this check isn't stateless, it actually populates "locs" at the same time
                //  this is handled in addToDatastore (update case), temp removed when adding to DB
            } //TESTED (2.1.2, diagnostic mode, doc2)
        } else { // Not an update case, we're going to have to grab the document after all, which is a bit slower
            needToFindAndModify = true;
    } //TESTED (2.1.2, diagnostic mode, doc2)
    if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore
        if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc)
            // (the timestamp carried by the update id is used as the creation date)
            doc.setCreated(new Date(doc.getUpdateId().getTime()));
        } //TESTED (2.1.2, diagnostic mode, doc2)               
    // (if we're here and index is not set, then it is intended to be null)

    // 4] Update the doc_metadata collection

    BasicDBObject softDelete = getSoftDeleteUpdate();
    BasicDBObject deadDoc = null; // (not normally needed)

    if (needToFindAndModify) { // less pleasant, need to go grab the doc
        // (read happens before the soft delete below, and also runs in diagnostic mode)
        deadDoc = (BasicDBObject) col.findOne(query, fields);
    } //TESTED (2.1.2)

    // (the write is skipped in diagnostic mode; args 3/4 are upsert=false, multi=true)
    if (!_diagnosticMode) {
        col.update(query, softDelete, false, true); // (needs to be multi- even though there's a single element for sharding reasons)         
    } //TESTED (2.1.2)

    // 5] Add fields if necessary

    if (null != deadDoc) {
        doc.setCreated((Date) deadDoc.get(DocumentPojo.created_));
        // (if getting this doc anyway then might as well get the created)
        doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_));
        doc.setIndex((String) deadDoc.get(DocumentPojo.index_));

        if (_diagnosticMode) {
                    .println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString());
    } //TESTED (2.1.2)
    else if (_diagnosticMode) {
        // (no stored record was fetched: either none was needed, or the lookup found nothing)
        if (!needToFindAndModify) {
            System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted "
                    + doc.toDb().toString());
        } else {
                    "StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString());
    } //TESTED (2.1.2)

From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java

License:Apache License

// Reads association features from the MongoDB association feature collection and bulk-adds them to the
// "association_index" Elasticsearch index (the index is created first, or fetched if it already exists).
// query / nSkip / nLimit select the records to read; a non-null chunk adds min/max bounds to the cursor.
// NOTE(review): this listing appears to have lost some lines (closing braces, statement openers, part of
// the loop body); the comments added below describe only the code that is visible here.
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

    // Initialize the ES (create the index if it doesn't already):

    // 1. Set-up the entity feature index 


    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex("association_index");

    // Create the index if necessary
    String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
    // (single shard, no replicas, plus a "suggestAnalyzer" built from the standard tokenizer and the
    //  standard + lowercase filters)
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping,
            localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex("association_index");

    // Now query the DB:

    DBCursor dbc = null;
    dbc = eventFeatureDB.find(query);
    // (when working on a chunk, restrict the cursor to the chunk's min/max bounds, if present)
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        // (DBCursor.count() does not take skip/limit into account, hence the manual subtraction)
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...

    // (batch of features waiting to be bulk-added to the index)
    List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();

    int nSynced = 0;

    // Loop over array and invoke the cleansing function for each one
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

        // If this table has just been rebuilt from the document then the indexes are all wrong ...
        // recalculate and save
        // NOTE(review): assumes evt.getIndex() is non-null and non-empty - confirm
        if ('#' == evt.getIndex().charAt(0)) {
            AssociationPojo singleEvt = new AssociationPojo();
                    .update(new BasicDBObject("_id", dbo.get("_id")),
                            new BasicDBObject(MongoDbManager.set_,
                                    new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                            false, true);
            // (has to be a multi-update even though it's unique because it's sharded on index)

        // Handle groups (system group is: "4c927585d591d31d7b37097a")
        if (null == evt.getCommunityId()) {
            evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
        // Bulk add prep

        // (flush once more than 1000 features have accumulated)
        if (events.size() > 1000) {
                    AssociationFeaturePojo.listType(), new AssociationFeaturePojoIndexMap()), "_id", null,
    // End loop over entities

    //write whatevers left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
            new AssociationFeaturePojoIndexMap()), "_id", null, true);

    // (per-chunk summary; the non-chunk case printed its count up front)
    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:

    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();


    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)

    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }/*from ww  w .  j  a v  a 2 s.  c o  m*/
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString()
                .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException(
                    "Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
                    SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) { // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)         
    } else {
        if (query.toString()
                .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTE (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key,
                    added += sourceKeysForSource.size();
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {

                System.out.println("(Can't optimize complex community query: " + e.getMessage());
        } //TESTED (by hand - including distributed source version)
    // Ignored delete objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
      //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {

    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...

    byte[] storageArray = new byte[200000];

    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            try { // (Just in case the index requires some time to sort itself out)
            } catch (InterruptedException e) {

        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);

            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    if (dbc.hasNext()) {
                    } //TESTED (by hand)               

                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                    .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);

                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                } //TESTED (by hand)

                _sourceCache.put(doc.getSourceKey(), src);
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();

                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                doc.getRawSourceKey()); // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                                new BasicDBObject(DbManager.addToSet_, new BasicDBObject(DocumentPojo.tags_,
                                        new BasicDBObject(DbManager.each_, tagsTidied))));
                    doc.setTags(tagsTidied); // (just copy ptr across)

        // 2. Update the index with the new document            

        // (Optionally also update entity and assoc features)

        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
        } //TESTED

        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();

            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs               
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;

            } //TOTEST            

            System.out.println("(Synced " + nSynced + " records)");

    } // (End loop over docs)

    // Sync remaining docs

    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs               
            doAggregation(communityList, docsToTransfer);

        StoreAndIndexManager manager = new StoreAndIndexManager();

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");

    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
                "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
        } catch (InterruptedException e) {

        // Turn off so we can exit

From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java

License:Apache License

// Test/debug helper: recreates the "document_index" index on the given Elasticsearch host/port, then walks
// up to nLimit documents matching query, fetching the gzipped content and the source tags for each.
// sMongoDbHost / sMongoDbPort are not referenced anywhere in the visible code.
// NOTE(review): this listing appears to have lost some lines (closing braces, statement openers, parts of
// the loop body); the comments added below describe only the code that is visible here.
private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
        BasicDBObject query, int nLimit) {
    ElasticSearchManager elasticManager = null;

    try {
        // Initialize the DB:

        DBCollection feedsDB = DbManager.getDocument().getMetadata();
        DBCollection contentDB = DbManager.getDocument().getContent();
        DBCollection sourcesDB = DbManager.getIngest().getSource();

        String indexName = "document_index";

        // Test/debug recreate the index
        // (hard-wired on: the index is always dropped and recreated by this helper)
        if (true) {

            // (delete the index)
            System.out.println("Deleting index...");
            elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
            //(also deletes the child index - same index, different type)

            // Create the index if necessary
            String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(),

            Builder localSettings = ImmutableSettings.settingsBuilder();
            localSettings.put("number_of_shards", 10).put("number_of_replicas", 2);

            System.out.println("Creating index..." + sMapping);
            elasticManager = ElasticSearchManager.createIndex(indexName, null, false,
                    sElasticHost + ":" + sElasticPort, sMapping, localSettings);

        // Get the index (necessary if already created)
        if (null == elasticManager) {
            elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);

        // Get the feeds from the DB:

        //         System.out.println("Querying DB...");

        DBCursor dbc = feedsDB.find(query).limit(nLimit);

        // (scratch buffer for the gzip read below)
        byte[] storageArray = new byte[200000];

        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);

            System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());

            // Get the content:
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
            if (null != dboContent) {
                byte[] compressedData = ((byte[]) dboContent.get("gzip_content"));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                // NOTE(review): a single read() call - it can return fewer bytes than the stream holds
                // (and at most 200000), and a negative count for an empty stream would make the String
                // constructor below throw; acceptable for test code but confirm
                int nRead = gzip.read(storageArray, 0, 200000);
                String s = new String(storageArray, 0, nRead, "UTF-8");
            // Get tag:
            SourcePojo src = _sourceCache.get(doc.getSourceKey());
            if (null == src) {
                BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                        .findOne(new BasicDBObject("key", doc.getSourceKey()));
                if (null != srcDbo) {
                    // (deserialized via Gson from the DB object's JSON string)
                    src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);

                    _sourceCache.put(doc.getSourceKey(), src);
            if (null != src) {
                Set<String> tagsTidied = new TreeSet<String>();
                // NOTE(review): src.getTags() is iterated without a null check here - confirm it cannot
                // be null for the sources this helper is run against
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();

            //TEST: set dynamic field
            // Lots of testing of dynamic dates:
            //            feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
            //            String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated());            
            //            feed.addToMetadata("another_dateISO", s1);
            //            String s1_5 = new SimpleDateFormat().format(feed.getCreated());
            //            feed.addToMetadata("another_dateTimeJava", s1_5);
            //            String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated());            
            //            feed.addToMetadata("another_dateYYYYMMDD", s2);
            //            String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated());
            //            feed.addToMetadata("another_dateRFC822", s3);
            //            feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString());
            //            // Testing of the string field types
            //            feed.addToMetadata("my_comment", "Testing this ABCDEFG");            
            //            feed.addToMetadata("my_term", "Testing this UVWXYZ");
            //            feed.addToMetadata("my_text", "Testing this 123456");            
            //            // Test an array of longs:
            //            Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L;
            //            feed.addToMetadata("md_long", tl);

            //TEST: some dummy event timestamp adding code (not seeing much/any in the data)
            //            if (null != feed.getEvents()) {
            //               int i = 0;
            //               for (EventPojo evt: feed.getEvents()) {
            //                  //1: Add single date
            //                  if (0 == i) {
            //                     evt.time_start = "2011-01-01";
            //                  }
            //                  //2: Add short span
            //                  if (1 == i) {
            //                     evt.time_start = "2010-04-06";
            //                     evt.time_end = "2010-08-09";
            //                  }
            //                  //3: Add cross-yr span
            //                  if (2 == i) {
            //                     evt.time_start = "2012-06-05";
            //                     evt.time_end = "2013-09-05";
            //                  }
            //                  //4: Add too long span
            //                  if (3 == i) {
            //                     evt.time_start = "2012-04-06";
            //                     evt.time_end = "2014-04-09";
            //                  }
            //                  i++;
            //               }
            //            }

            // For event adding, see data_model.test.TestCode
    // NOTE(review): no handling is visible for the IOException - confirm whether it is reported
    } catch (IOException e) {
    } finally {
        //nothing to do

From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java

License:Apache License

private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

    // Initialize the ES (create the index if it doesn't already exist):

    // 1. Set-up the entity feature index 

    String indexName = "entity_index";

    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex(indexName);

    // Create the index if necessary
    String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

    elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);

    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex(indexName);
    }

    // Now query the DB:

    DBCursor dbc = null;
    dbc = entityFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...

    int nSynced = 0;

    List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
    while (dbc.hasNext()) {
        EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

        if (null != feature.getAlias()) { // (some corrupt gazetteer entry)

            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            // if there is no community id, add system group (something is wrong if this happens?)
            if (null == feature.getCommunityId()) {
                feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));


        // Add the entities
        if (entities.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                    new EntityFeaturePojoIndexMap()), "_id", null, true);
            // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

            entities = new ArrayList<EntityFeaturePojo>();
    //write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
            new EntityFeaturePojoIndexMap()), "_id", null, true);
    // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");