com.ikanow.aleph2.analytics.spark.services.TestEnrichmentPipelineService.java Source code

Introduction

Here is the source code for com.ikanow.aleph2.analytics.spark.services.TestEnrichmentPipelineService.java
Source

/*******************************************************************************
 * Copyright 2016, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package com.ikanow.aleph2.analytics.spark.services;

import static org.junit.Assert.*;

import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

import scala.Tuple2;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableMap;
import com.ikanow.aleph2.core.shared.utils.BatchRecordUtils;
import com.ikanow.aleph2.data_model.interfaces.data_analytics.IAnalyticsContext;
import com.ikanow.aleph2.data_model.interfaces.data_analytics.IBatchRecord;
import com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule;
import com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IBucketLogger;
import com.ikanow.aleph2.data_model.interfaces.shared_services.ICrudService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IServiceContext;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IUnderlyingService;
import com.ikanow.aleph2.data_model.objects.data_analytics.AnalyticThreadJobBean;
import com.ikanow.aleph2.data_model.objects.data_import.AnnotationBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketStatusBean;
import com.ikanow.aleph2.data_model.objects.data_import.EnrichmentControlMetadataBean;
import com.ikanow.aleph2.data_model.objects.shared.AssetStateDirectoryBean.StateDirectoryType;
import com.ikanow.aleph2.data_model.objects.shared.BasicMessageBean;
import com.ikanow.aleph2.data_model.objects.shared.SharedLibraryBean;
import com.ikanow.aleph2.data_model.utils.BeanTemplateUtils;
import com.ikanow.aleph2.data_model.utils.ErrorUtils;
import com.ikanow.aleph2.data_model.utils.Optionals;
import com.ikanow.aleph2.data_model.utils.Tuples;

import fj.data.Either;
import fj.data.Validation;

/**
 * @author Alex
 *
 */
public class TestEnrichmentPipelineService {
    final static ObjectMapper _mapper = BeanTemplateUtils.configureMapper(Optional.empty());

    static JavaSparkContext _spark;

    @Before
    public void setup() {
        if (null != _spark) {
            return;
        }
        _spark = new JavaSparkContext("local", "TestEnrichmentPipelineService");
    }

    @Test
    public void test_groupingBehavior() {

        //(quickly test the whole thing works!)
        {
            JavaRDD<String> test = _spark.parallelize(Arrays.asList("a", "b", "c"));
            assertEquals(3L, test.map(s -> s + "X").count());
        }

        // (sample group)
        {
            JavaRDD<Tuple2<String, String>> test = _spark.parallelize(
                    Arrays.asList(Tuples._2T("a", "resa1"), Tuples._2T("b", "resb1"), Tuples._2T("a", "resa2")));
            assertEquals(2L, test.groupBy(t2 -> t2._1()).map(key_lt2 -> {
                System.out.println("key=" + key_lt2._1() + ".. vals = " + Optionals.streamOf(key_lt2._2(), false)
                        .map(t2 -> t2.toString()).collect(Collectors.joining(";")));
                return null;
            }).count());
        }

    }

    //////////////////////////////////////////////////////////////

    @Test
    public void test_inMapPartitions() {

        final DataBucketBean test_bucket = BeanTemplateUtils.build(DataBucketBean.class)
                .with(DataBucketBean::full_name, "/test").done().get();

        final IAnalyticsContext mock_analytics_context = Mockito.mock(IAnalyticsContext.class,
                Mockito.withSettings().serializable());
        // ^ (can't include IAnalyticsContext in here, which is going to make life unpleasant)
        Mockito.when(mock_analytics_context.getUnderlyingPlatformDriver(Mockito.eq(IEnrichmentModuleContext.class),
                Mockito.any())).thenAnswer(new ContextAnswer());
        Mockito.when(mock_analytics_context.getLibraryConfigs()).thenAnswer(new LibraryConfigsAnswer());
        Mockito.when(mock_analytics_context.getBucket()).thenAnswer(new BucketAnswer(test_bucket));

        // Empty set should fail:
        {
            final List<EnrichmentControlMetadataBean> pipeline_elements = Arrays.asList();

            try {
                @SuppressWarnings("unused")
                final EnrichmentPipelineService under_test = EnrichmentPipelineService
                        .create(mock_analytics_context, false, pipeline_elements);
                fail("Should error");
            } catch (Exception e) {
                // Good
            }
        }
        // Now try with a non trivial set of enrichment elements
        {
            final List<EnrichmentControlMetadataBean> pipeline_elements = Arrays.asList(
                    BeanTemplateUtils.build(EnrichmentControlMetadataBean.class)
                            .with(EnrichmentControlMetadataBean::name, "test1")
                            .with(EnrichmentControlMetadataBean::entry_point, TestEnrichmentModule.class.getName())
                            .with(EnrichmentControlMetadataBean::technology_override,
                                    new LinkedHashMap<String, Object>(ImmutableMap.of("batch_size", "10")))
                            .with(EnrichmentControlMetadataBean::config,
                                    new LinkedHashMap<String, Object>(ImmutableMap.of("append_field", "test1_field",
                                            "stop_field", "test1_stop")))
                            .done().get(),
                    BeanTemplateUtils.build(EnrichmentControlMetadataBean.class)
                            .with(EnrichmentControlMetadataBean::name, "test2")
                            .with(EnrichmentControlMetadataBean::entry_point, TestEnrichmentModule.class.getName())
                            .with(EnrichmentControlMetadataBean::technology_override,
                                    new LinkedHashMap<String, Object>(ImmutableMap.of("batch_size", 7)))
                            .with(EnrichmentControlMetadataBean::config,
                                    new LinkedHashMap<String, Object>(ImmutableMap.of("append_field", "test2_field",
                                            "stop_field", "test2_stop")))
                            .done().get(),
                    BeanTemplateUtils.build(EnrichmentControlMetadataBean.class)
                            .with(EnrichmentControlMetadataBean::name, "test3")
                            .with(EnrichmentControlMetadataBean::entry_point, TestEnrichmentModule.class.getName())
                            .with(EnrichmentControlMetadataBean::technology_override,
                                    new LinkedHashMap<String, Object>(ImmutableMap.of("batch_size", 20L)))
                            .with(EnrichmentControlMetadataBean::config, new LinkedHashMap<String, Object>(
                                    ImmutableMap.of("append_field", "test3_field", "stop_field", "test3_stop")))
                            .done().get());

            final AnalyticThreadJobBean job = BeanTemplateUtils.build(AnalyticThreadJobBean.class)
                    .with(AnalyticThreadJobBean::config,
                            new LinkedHashMap<>(ImmutableMap.of(EnrichmentControlMetadataBean.ENRICHMENT_PIPELINE,
                                    pipeline_elements.stream().map(b -> BeanTemplateUtils.toMap(b))
                                            .collect(Collectors.toList()))))
                    .done().get();
            Mockito.when(mock_analytics_context.getJob()).thenAnswer(new JobAnswer(job));

            // Actual test:
            {
                final EnrichmentPipelineService under_test = EnrichmentPipelineService
                        .create(mock_analytics_context, false, pipeline_elements);

                // Check it's all serializable:
                try {
                    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(under_test);
                    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(under_test.inMapPartitions());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPreGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPreGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPrePostGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPrePostGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPostGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitions());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPreGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPreGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPrePostGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPrePostGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPostGroup());
                } catch (Throwable t) {
                    t.printStackTrace();
                    fail(ErrorUtils.getLongForm("{0}", t));
                }

                JavaRDD<Tuple2<Long, IBatchRecord>> test = _spark.parallelize(Arrays
                        .asList(_mapper.createObjectNode().put("id", "1"),
                                _mapper.createObjectNode().put("id", "2").put("test1_stop", true),
                                _mapper.createObjectNode().put("id", "3"),
                                _mapper.createObjectNode().put("id", "4"),
                                _mapper.createObjectNode().put("id", "5").put("test2_stop", true),
                                _mapper.createObjectNode().put("id", "6"),
                                _mapper.createObjectNode().put("id", "7"),
                                _mapper.createObjectNode().put("id", "8"),
                                _mapper.createObjectNode().put("id", "9").put("test3_stop", true),
                                _mapper.createObjectNode().put("id", "10"))
                        .stream()
                        .<Tuple2<Long, IBatchRecord>>map(
                                j -> Tuples._2T(0L, new BatchRecordUtils.JsonBatchRecord((JsonNode) j)))
                        .collect(Collectors.toList()));

                JavaRDD<Tuple2<Long, IBatchRecord>> out = test.mapPartitions(under_test.javaInMapPartitions());
                assertEquals(10, out.count());
                assertEquals(7, out.filter(t2 -> t2._2().getJson().has("test1_field")).count()); // 7 objects that pass through and aren't filtered by any stages (not the appended 1)
                assertEquals(8, out.filter(t2 -> t2._2().getJson().has("test2_field")).count()); // 8 objects that pass through (7+1 from parent) and .. (not the appended 1)
                assertEquals(9, out.filter(t2 -> t2._2().getJson().has("test3_field")).count()); // 9 objects that pass through (8+1 from parent) and .. (not the appended 1)
                assertEquals(3, out.filter(t2 -> t2._2().getJson().has("appended")).count());

                // Scala version:
                final EnrichmentPipelineService scala_under_test = EnrichmentPipelineService
                        .create(mock_analytics_context, false, pipeline_elements);
                assertEquals(10, test.rdd().mapPartitions(scala_under_test.inMapPartitions(), true,
                        scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class)).count());

            }
            // Test with pre-group
            {
                final EnrichmentPipelineService under_test = EnrichmentPipelineService
                        .select(mock_analytics_context, false, "test1", "test2", "test3");

                // Check it's serializable:
                try {
                    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(under_test);
                    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(under_test.inMapPartitions());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPreGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPreGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPrePostGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPrePostGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.inMapPartitionsPostGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitions());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPreGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPreGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPrePostGroup());
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPrePostGroup(Collections.emptyList()));
                    new ObjectOutputStream(new ByteArrayOutputStream())
                            .writeObject(under_test.javaInMapPartitionsPostGroup());
                } catch (Throwable t) {
                    t.printStackTrace();
                    fail(ErrorUtils.getLongForm("{0}", t));
                }

                JavaRDD<Tuple2<Long, IBatchRecord>> test = _spark.parallelize(Arrays
                        .asList(_mapper.createObjectNode().put("id", "1").put("grouper", "A"),
                                _mapper.createObjectNode().put("id", "2").put("test1_stop", true),
                                _mapper.createObjectNode().put("id", "3").put("grouper", "A"),
                                _mapper.createObjectNode().put("id", "4").put("grouper", "B"),
                                _mapper.createObjectNode().put("id", "5").put("test2_stop", true),
                                _mapper.createObjectNode().put("id", "6").put("grouper", "B"),
                                _mapper.createObjectNode().put("id", "7").put("grouper", "C"),
                                _mapper.createObjectNode().put("id", "8").put("grouper", "C"),
                                _mapper.createObjectNode().put("id", "9").put("test3_stop", true).put("grouper",
                                        "D"), // (wont' get promoted)
                                _mapper.createObjectNode().put("id", "10").put("grouper", "D"))
                        .stream()
                        .<Tuple2<Long, IBatchRecord>>map(
                                j -> Tuples._2T(0L, new BatchRecordUtils.JsonBatchRecord((JsonNode) j)))
                        .collect(Collectors.toList()));

                JavaRDD<Tuple2<IBatchRecord, Tuple2<Long, IBatchRecord>>> out = JavaRDD.fromRDD(
                        test.rdd().mapPartitions(under_test.inMapPartitionsPreGroup("grouper"), true,
                                scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class)),
                        scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class));

                //TRACE
                //out.foreach(f -> System.out.println(f + " ... " + f._1()._2().getJson()));

                assertEquals(10, out.count());
                assertEquals(7, out.filter(t2 -> t2._2()._2().getJson().has("test1_field")).count()); // 7 objects that pass through and aren't filtered by any stages (not the appended 1)
                assertEquals(8, out.filter(t2 -> t2._2()._2().getJson().has("test2_field")).count()); // 8 objects that pass through (7+1 from parent) and .. (not the appended 1)
                assertEquals(9, out.filter(t2 -> t2._2()._2().getJson().has("test3_field")).count()); // 9 objects that pass through (8+1 from parent) and .. (not the appended 1)
                assertEquals(3, out.filter(t2 -> t2._2()._2().getJson().has("appended")).count());
                assertEquals(2, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("A")).orElse(false)).count());
                assertEquals(2, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("B")).orElse(false)).count());
                assertEquals(2, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("C")).orElse(false)).count());
                assertEquals(1, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("D")).orElse(false)).count());
                assertEquals(3, out.filter(t2 -> 0 == t2._1().getJson().size()).count()); // (the injected records)

                // Other scala version/test can reuse the function. ie it's immutable:
                assertEquals(10,
                        test.rdd().mapPartitions(under_test.inMapPartitionsPreGroup(Arrays.asList("group")), true,
                                scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class)).count());
            }
            // Test with post group (no cloing of test_1 enricher) ... java version
            {
                final EnrichmentPipelineService under_test = EnrichmentPipelineService
                        .create(mock_analytics_context, false, pipeline_elements);

                JavaRDD<Tuple2<IBatchRecord, Iterable<Tuple2<Long, IBatchRecord>>>> test_java = EnrichmentPipelineService
                        .javaGroupOf(_spark.parallelize(Arrays
                                .asList(_mapper.createObjectNode().put("id", "1").put("grouper", "A"),
                                        _mapper.createObjectNode().put("id", "2").put("test1_stop", true)
                                                .put("grouper", "A"),
                                        _mapper.createObjectNode().put("id", "3").put("grouper", "A"),
                                        _mapper.createObjectNode().put("id", "4").put("grouper", "B"),
                                        _mapper.createObjectNode().put("id", "5").put("test2_stop", true)
                                                .put("grouper", "B"),
                                        _mapper.createObjectNode().put("id", "6").put("grouper", "B"),
                                        _mapper.createObjectNode().put("id", "7").put("grouper", "C"),
                                        _mapper.createObjectNode().put("id", "8").put("grouper", "C"),
                                        _mapper.createObjectNode().put("id", "9").put("test3_stop", true)
                                                .put("grouper", "D"), // (wont' get promoted)
                                        _mapper.createObjectNode().put("id", "10").put("grouper", "D"))
                                .stream()
                                .<Tuple2<Long, IBatchRecord>>map(
                                        j -> Tuples._2T(0L, new BatchRecordUtils.JsonBatchRecord((JsonNode) j)))
                                .map(t2 -> Tuples._2T((IBatchRecord) new BatchRecordUtils.JsonBatchRecord(
                                        t2._2().getJson().get("grouper")), t2))
                                .collect(Collectors.toList())));

                RDD<Tuple2<IBatchRecord, Iterable<Tuple2<Long, IBatchRecord>>>> test = test_java.rdd();

                assertEquals(4, test.count());

                JavaRDD<Tuple2<Long, IBatchRecord>> out = JavaRDD.fromRDD(
                        test.mapPartitions(under_test.inMapPartitionsPostGroup(), true,
                                scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class)),
                        scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class));

                assertEquals(10, out.count());
                // (the other counts have been performed sufficiently many times for code coverage safety at this point)

                //(can uncomment to check compiles ... but is called by scala so coverage is fine)
                //test_java.mapPartitions(under_test.javaInMapPartitionsPostGroup());

            }
            // As above but with clone mode enabled (plus use direct grouping in scala for coverage purposes)
            {
                final List<EnrichmentControlMetadataBean> cloning_pipeline_elements = Arrays
                        .asList(BeanTemplateUtils.build(EnrichmentControlMetadataBean.class)
                                .with(EnrichmentControlMetadataBean::name, "test1")
                                .with(EnrichmentControlMetadataBean::entry_point,
                                        TestEnrichmentModule.class.getName())
                                .with(EnrichmentControlMetadataBean::technology_override,
                                        new LinkedHashMap<String, Object>(ImmutableMap.of("batch_size", "10")))
                                .with(EnrichmentControlMetadataBean::config,
                                        new LinkedHashMap<String, Object>(ImmutableMap.of("append_field",
                                                "test1_field", "stop_field", "test1_stop", "clone_mode", true)))
                                .done().get(), pipeline_elements.get(1), pipeline_elements.get(2));

                final EnrichmentPipelineService under_test = EnrichmentPipelineService
                        .create(mock_analytics_context, false, cloning_pipeline_elements);

                RDD<Tuple2<IBatchRecord, Tuple2<Long, IBatchRecord>>> test = _spark.parallelize(Arrays
                        .asList(_mapper.createObjectNode().put("id", "1").put("grouper", "A"),
                                _mapper.createObjectNode().put("id", "2").put("test1_stop", true).put("grouper",
                                        "A"),
                                _mapper.createObjectNode().put("id", "3").put("grouper", "A"),
                                _mapper.createObjectNode().put("id", "4").put("grouper", "B"),
                                _mapper.createObjectNode().put("id", "5").put("test2_stop", true).put("grouper",
                                        "B"),
                                _mapper.createObjectNode().put("id", "6").put("grouper", "B"),
                                _mapper.createObjectNode().put("id", "7").put("grouper", "C"),
                                _mapper.createObjectNode().put("id", "8").put("grouper", "C"),
                                _mapper.createObjectNode().put("id", "9").put("test3_stop", true).put("grouper",
                                        "D"), // (wont' get promoted)
                                _mapper.createObjectNode().put("id", "10").put("grouper", "D"))
                        .stream()
                        .<Tuple2<Long, IBatchRecord>>map(
                                j -> Tuples._2T(0L, new BatchRecordUtils.JsonBatchRecord((JsonNode) j)))
                        .map(t2 -> Tuples._2T((IBatchRecord) new BatchRecordUtils.JsonBatchRecord(
                                t2._2().getJson().get("grouper")), t2))
                        .collect(Collectors.toList())).rdd();

                assertEquals(10, test.count()); //(pre grouping)

                JavaRDD<Tuple2<Long, IBatchRecord>> out = JavaRDD.fromRDD(
                        EnrichmentPipelineService.groupOf(test).mapPartitions(under_test.inMapPartitionsPostGroup(),
                                true, scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class)),
                        scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class));

                assertEquals(14, out.count());
                // (the other counts have been performed sufficiently many times for code coverage safety at this point)
            }
            // Finally, a combined pre-post group
            {
                final EnrichmentPipelineService under_test = EnrichmentPipelineService
                        .create(mock_analytics_context, false, pipeline_elements);

                RDD<Tuple2<IBatchRecord, Iterable<Tuple2<Long, IBatchRecord>>>> test = _spark
                        .parallelize(Arrays
                                .asList(_mapper.createObjectNode().put("id", "1").put("grouper", "A"),
                                        _mapper.createObjectNode().put("id", "2").put("test1_stop", true)
                                                .put("grouper", "A"),
                                        _mapper.createObjectNode().put("id", "3").put("grouper", "A"),
                                        _mapper.createObjectNode().put("id", "4").put("grouper", "B"),
                                        _mapper.createObjectNode().put("id", "5").put("test2_stop", true)
                                                .put("grouper", "B"),
                                        _mapper.createObjectNode().put("id", "6").put("grouper", "B"),
                                        _mapper.createObjectNode().put("id", "7").put("grouper", "C"),
                                        _mapper.createObjectNode().put("id", "8").put("grouper", "C"),
                                        _mapper.createObjectNode().put("id", "9").put("test3_stop", true)
                                                .put("grouper", "D"), // (wont' get promoted)
                                        _mapper.createObjectNode().put("id", "10").put("grouper", "D"))
                                .stream()
                                .<Tuple2<Long, IBatchRecord>>map(
                                        j -> Tuples._2T(0L, new BatchRecordUtils.JsonBatchRecord((JsonNode) j)))
                                .collect(Collectors.toList()))
                        .groupBy(t2 -> (IBatchRecord) new BatchRecordUtils.JsonBatchRecord(
                                t2._2().getJson().get("grouper")))
                        .rdd();

                JavaRDD<Tuple2<IBatchRecord, Tuple2<Long, IBatchRecord>>> out = JavaRDD.fromRDD(
                        test.mapPartitions(under_test.inMapPartitionsPrePostGroup("grouper"), true,
                                scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class)),
                        scala.reflect.ClassTag$.MODULE$.apply(Tuple2.class));

                //TRACE
                //out.foreach(f -> System.out.println(f + " ... " + f._1()._2().getJson()));

                assertEquals(10, out.count());
                assertEquals(7, out.filter(t2 -> t2._2()._2().getJson().has("test1_field")).count()); // 7 objects that pass through and aren't filtered by any stages (not the appended 1)
                assertEquals(8, out.filter(t2 -> t2._2()._2().getJson().has("test2_field")).count()); // 8 objects that pass through (7+1 from parent) and .. (not the appended 1)
                assertEquals(9, out.filter(t2 -> t2._2()._2().getJson().has("test3_field")).count()); // 9 objects that pass through (8+1 from parent) and .. (not the appended 1)
                assertEquals(3, out.filter(t2 -> t2._2()._2().getJson().has("appended")).count());
                assertEquals(2, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("A")).orElse(false)).count());
                assertEquals(2, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("B")).orElse(false)).count());
                assertEquals(2, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("C")).orElse(false)).count());
                assertEquals(1, out.filter(t2 -> Optional.ofNullable(t2._1().getJson().get("grouper"))
                        .map(j -> j.asText()).map(s -> s.equals("D")).orElse(false)).count());
                assertEquals(3, out.filter(t2 -> 0 == t2._1().getJson().size()).count()); // (the injected records)
            }
        }

    }

    //////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////

    // Some enrichment services:

    public static class TestEnrichmentModule implements IEnrichmentBatchModule {

        Optional<List<String>> _next_grouping_fields;
        String _append_field;
        String _stop_field;
        String _name;
        IEnrichmentModuleContext _context;
        boolean _clone_mode;

        /** System c'tor
         */
        public TestEnrichmentModule() {
        }

        /** Copy c'tor
         * @param copy
         */
        protected TestEnrichmentModule(TestEnrichmentModule copy) {
            _append_field = copy._append_field;
            _stop_field = copy._stop_field;
            _context = copy._context;
            _next_grouping_fields = copy._next_grouping_fields;
            _name = copy._name;
            _clone_mode = copy._clone_mode;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#onStageInitialize(com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext, com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean, com.ikanow.aleph2.data_model.objects.data_import.EnrichmentControlMetadataBean, scala.Tuple2, java.util.Optional)
         */
        @Override
        public void onStageInitialize(IEnrichmentModuleContext context, DataBucketBean bucket,
                EnrichmentControlMetadataBean control, Tuple2<ProcessingStage, ProcessingStage> previous_next,
                Optional<List<String>> next_grouping_fields) {

            _append_field = Optional.ofNullable(control.config().get("append_field")).orElse("").toString();
            _stop_field = Optional.ofNullable(control.config().get("stop_field")).orElse("").toString();
            _context = context;
            _next_grouping_fields = next_grouping_fields;
            _name = control.name();
            _clone_mode = Optional.ofNullable(control.config().get("clone_mode")).map(b -> (Boolean) b)
                    .orElse(false);
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#onObjectBatch(java.util.stream.Stream, java.util.Optional, java.util.Optional)
         */
        @Override
        public void onObjectBatch(Stream<Tuple2<Long, IBatchRecord>> batch, Optional<Integer> batch_size,
                Optional<JsonNode> grouping_key) {

            batch.forEach(t2 -> {
                final ObjectNode j = (ObjectNode) t2._2().getJson();
                if (!j.has(_stop_field)) {
                    j.put(_append_field, "test");
                    _context.emitMutableObject(t2._1(), j, Optional.empty(),
                            _next_grouping_fields
                                    .map(ff -> ff.stream().reduce(_mapper.createObjectNode(),
                                            (acc, v) -> (ObjectNode) Optional.ofNullable(j.get(v))
                                                    .map(jj -> acc.set(v, jj)).orElse(acc),
                                            (j1, j2) -> j1))
                                    .map(o -> (JsonNode) o).filter(o -> o.size() > 0));
                }
            });
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#onStageComplete(boolean)
         */
        @Override
        public void onStageComplete(boolean is_original) {
            // Send one extra object
            final ObjectNode o = _mapper.createObjectNode().put("appended", _name);
            _context.emitMutableObject(1000, o, Optional.empty(), Optional.empty());
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#cloneForNewGrouping()
         */
        @Override
        public IEnrichmentBatchModule cloneForNewGrouping() {
            if (_clone_mode) {
                return new TestEnrichmentModule(this);
            } else {
                return this;
            }
        }

    }

    ////////////////////////////////////////////////////

    public static class TestEnrichmentContext implements IEnrichmentModuleContext {
        final LinkedList<Tuple2<Tuple2<Long, IBatchRecord>, Optional<JsonNode>>> _l = new LinkedList<>();

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IUnderlyingService#getUnderlyingArtefacts()
         */
        @Override
        public Collection<Object> getUnderlyingArtefacts() {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IUnderlyingService#getUnderlyingPlatformDriver(java.lang.Class, java.util.Optional)
         */
        @SuppressWarnings("unchecked")
        @Override
        public <T> Optional<T> getUnderlyingPlatformDriver(Class<T> driver_class, Optional<String> driver_options) {

            if (List.class.isAssignableFrom(driver_class))
                return (Optional<T>) Optional.of(_l);
            else
                return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getEnrichmentContextSignature(java.util.Optional, java.util.Optional)
         */
        @Override
        public String getEnrichmentContextSignature(Optional<DataBucketBean> bucket,
                Optional<Set<Tuple2<Class<? extends IUnderlyingService>, Optional<String>>>> services) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getTopologyEntryPoints(java.lang.Class, java.util.Optional)
         */
        @Override
        public <T> Collection<Tuple2<T, String>> getTopologyEntryPoints(Class<T> clazz,
                Optional<DataBucketBean> bucket) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getTopologyStorageEndpoint(java.lang.Class, java.util.Optional)
         */
        @Override
        public <T> T getTopologyStorageEndpoint(Class<T> clazz, Optional<DataBucketBean> bucket) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getTopologyErrorEndpoint(java.lang.Class, java.util.Optional)
         */
        @Override
        public <T> T getTopologyErrorEndpoint(Class<T> clazz, Optional<DataBucketBean> bucket) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getNextUnusedId()
         */
        @Override
        public long getNextUnusedId() {

            return 0;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#convertToMutable(com.fasterxml.jackson.databind.JsonNode)
         */
        @Override
        public ObjectNode convertToMutable(JsonNode original) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#emitMutableObject(long, com.fasterxml.jackson.databind.node.ObjectNode, java.util.Optional, java.util.Optional)
         */
        @Override
        public Validation<BasicMessageBean, JsonNode> emitMutableObject(long id, ObjectNode mutated_json,
                Optional<AnnotationBean> annotations, Optional<JsonNode> grouping_key) {

            _l.add(Tuples._2T(Tuples._2T(id, new BatchRecordUtils.JsonBatchRecord(mutated_json)), grouping_key));

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#emitImmutableObject(long, com.fasterxml.jackson.databind.JsonNode, java.util.Optional, java.util.Optional, java.util.Optional)
         */
        @Override
        public Validation<BasicMessageBean, JsonNode> emitImmutableObject(long id, JsonNode original_json,
                Optional<ObjectNode> mutations, Optional<AnnotationBean> annotations,
                Optional<JsonNode> grouping_key) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#storeErroredObject(long, com.fasterxml.jackson.databind.JsonNode)
         */
        @Override
        public void storeErroredObject(long id, JsonNode original_json) {

        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#externalEmit(com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean, fj.data.Either, java.util.Optional)
         */
        @Override
        public Validation<BasicMessageBean, JsonNode> externalEmit(DataBucketBean bucket,
                Either<JsonNode, Map<String, Object>> object, Optional<AnnotationBean> annotations) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#flushBatchOutput(java.util.Optional)
         */
        @Override
        public CompletableFuture<?> flushBatchOutput(Optional<DataBucketBean> bucket) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getServiceContext()
         */
        @Override
        public IServiceContext getServiceContext() {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getGlobalEnrichmentModuleObjectStore(java.lang.Class, java.util.Optional)
         */
        @Override
        public <S> Optional<ICrudService<S>> getGlobalEnrichmentModuleObjectStore(Class<S> clazz,
                Optional<String> collection) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getBucketObjectStore(java.lang.Class, java.util.Optional, java.util.Optional, java.util.Optional)
         */
        @Override
        public <S> ICrudService<S> getBucketObjectStore(Class<S> clazz, Optional<DataBucketBean> bucket,
                Optional<String> collection, Optional<StateDirectoryType> type) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getBucket()
         */
        @Override
        public Optional<DataBucketBean> getBucket() {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getModuleConfig()
         */
        @Override
        public Optional<SharedLibraryBean> getModuleConfig() {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getBucketStatus(java.util.Optional)
         */
        @Override
        public Future<DataBucketStatusBean> getBucketStatus(Optional<DataBucketBean> bucket) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#getLogger(java.util.Optional)
         */
        @Override
        public IBucketLogger getLogger(Optional<DataBucketBean> bucket) {

            return null;
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#emergencyDisableBucket(java.util.Optional)
         */
        @Override
        public void emergencyDisableBucket(Optional<DataBucketBean> bucket) {

        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#emergencyQuarantineBucket(java.util.Optional, java.lang.String)
         */
        @Override
        public void emergencyQuarantineBucket(Optional<DataBucketBean> bucket, String quarantine_duration) {

        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext#initializeNewContext(java.lang.String)
         */
        @Override
        public void initializeNewContext(String signature) {

        }

    }

    ////////////////////////////////////////////////////

    // Analytics context

    @SuppressWarnings("rawtypes")
    public static class JobAnswer implements Answer, Serializable {
        private static final long serialVersionUID = -3489108224499090639L;

        final AnalyticThreadJobBean _job;

        JobAnswer(final AnalyticThreadJobBean job) {
            _job = job;
        }

        /* (non-Javadoc)
         * @see org.mockito.stubbing.Answer#answer(org.mockito.invocation.InvocationOnMock)
         */
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            return Optional.of(_job);
        }

    }

    @SuppressWarnings("rawtypes")
    public static class ContextAnswer implements Answer, Serializable {
        private static final long serialVersionUID = -3489108224499090639L;

        /* (non-Javadoc)
         * @see org.mockito.stubbing.Answer#answer(org.mockito.invocation.InvocationOnMock)
         */
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            return Optional.of(new TestEnrichmentContext());
        }

    }

    @SuppressWarnings("rawtypes")
    public static class LibraryConfigsAnswer implements Answer, Serializable {
        private static final long serialVersionUID = -2951282076341800662L;

        /* (non-Javadoc)
         * @see org.mockito.stubbing.Answer#answer(org.mockito.invocation.InvocationOnMock)
         */
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            return Collections.emptyMap();
        }

    }

    @SuppressWarnings("rawtypes")
    public static class BucketAnswer implements Answer, Serializable {
        private static final long serialVersionUID = -7059333322269908505L;

        final DataBucketBean _bucket;

        BucketAnswer(final DataBucketBean bucket) {
            _bucket = bucket;
        }

        /* (non-Javadoc)
         * @see org.mockito.stubbing.Answer#answer(org.mockito.invocation.InvocationOnMock)
         */
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            return Optional.of(_bucket);
        }

    }

}