com.ikanow.aleph2.analytics.services.DeduplicationEnrichmentContext.java Source code

Java tutorial

Introduction

Here is the source code for com.ikanow.aleph2.analytics.services.DeduplicationEnrichmentContext.java

Source

/*******************************************************************************
 * Copyright 2016, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package com.ikanow.aleph2.analytics.services;

import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import scala.Tuple2;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.ikanow.aleph2.analytics.utils.ErrorUtils;
import com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IBucketLogger;
import com.ikanow.aleph2.data_model.interfaces.shared_services.ICrudService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IServiceContext;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IUnderlyingService;
import com.ikanow.aleph2.data_model.objects.data_import.AnnotationBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketStatusBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean.CustomPolicy;
import com.ikanow.aleph2.data_model.objects.shared.AssetStateDirectoryBean.StateDirectoryType;
import com.ikanow.aleph2.data_model.objects.shared.BasicMessageBean;
import com.ikanow.aleph2.data_model.objects.shared.SharedLibraryBean;

import fj.data.Either;
import fj.data.Validation;

/** A wrapper for an enrichment context called as part of a merge between new and existing objects.
 *  Emit calls are checked against the configured deduplication policy before being handed to the
 *  wrapped context; external emits and all other calls are forwarded to the wrapped context unchanged.
 * @author Alex
 *
 */
public class DeduplicationEnrichmentContext implements IEnrichmentModuleContext {
    /** The wrapped context that actually performs the emits and services every other call */
    protected final IEnrichmentModuleContext _delegate;

    //////////////////////////////////////////////////////////////////////

    // INITIALIZATION

    /** The document schema configuration supplied at construction */
    protected final DataSchemaBean.DocumentSchemaBean _config;

    /** Per-merge state, re-initialized by {@link #resetMutableState(Collection, JsonNode)}
     */
    protected static class MutableState {
        // _ids of the current duplicates; starts out empty (and is never null) so that emitting or
        // asking for deletions before the first reset does not throw a NullPointerException
        Set<JsonNode> _id_set = Collections.emptySet();
        // Grouping key of the current merge; empty until the first reset
        Optional<JsonNode> _grouping_key = Optional.empty();
        // Number of objects passed on to the wrapped context since the last reset
        int _num_emitted = 0;
        // _ids that the user explicitly asked to delete since the last reset
        LinkedList<JsonNode> _manual_ids_to_delete = new LinkedList<>();
    }

    protected final MutableState _mutable_state = new MutableState();
    protected final CustomPolicy _custom_policy;
    protected final boolean _delete_unhandled_duplicates;
    protected final Function<JsonNode, Optional<JsonNode>> _json_to_grouping_key;

    /** The single error returned whenever an emit breaks the deduplication policy */
    protected static final BasicMessageBean _ERROR_BEAN = ErrorUtils.buildErrorMessage(
            DeduplicationEnrichmentContext.class.getSimpleName(), "checkObjectLogic",
            ErrorUtils.BREAK_DEDUPLICATION_POLICY);

    /** User c'tor
     * @param delegate the context to wrap
     * @param config the document schema configuration; supplies the custom policy (defaults to "strict"
     *               if not set) and the "delete unhandled duplicates" flag (if not set, defaults to true
     *               in "very_strict" mode and false otherwise)
     * @param json_to_grouping_key extracts the (optional) grouping key from a JSON object
     */
    public DeduplicationEnrichmentContext(final IEnrichmentModuleContext delegate,
            final DataSchemaBean.DocumentSchemaBean config,
            final Function<JsonNode, Optional<JsonNode>> json_to_grouping_key) {
        _delegate = delegate;
        _config = config;
        _custom_policy = Optional.ofNullable(_config.custom_policy()).orElse(CustomPolicy.strict);
        // The default depends on the policy: "very_strict" deletes unhandled duplicates unless told otherwise
        _delete_unhandled_duplicates = Optional.ofNullable(_config.delete_unhandled_duplicates())
                .orElse(CustomPolicy.very_strict == _custom_policy);
        _json_to_grouping_key = json_to_grouping_key;
    }

    /** Called for every new/existing merge
     * @param dups the duplicate objects for this merge; the _id of each one (where present) is recorded
     * @param grouping_key the JSON passed to the grouping key function to obtain this merge's grouping key
     */
    public void resetMutableState(final Collection<JsonNode> dups, final JsonNode grouping_key) {
        _mutable_state._id_set = dups.stream()
                .map(j -> j.get(AnnotationBean._ID))
                .filter(id -> null != id) // (objects without an _id are simply not tracked)
                .collect(Collectors.toSet());

        _mutable_state._grouping_key = _json_to_grouping_key.apply(grouping_key);
        _mutable_state._num_emitted = 0;
        _mutable_state._manual_ids_to_delete = new LinkedList<JsonNode>();
    }

    //////////////////////////////////////////////////////////////////////

    // EMIT LOGIC

    /** Will error out based on the mode.
     *  Options:
     *  - if in "strict" mode (default) then only allow emitting an object without a matching _id if it
     *    has a different grouping key (ensures don't generate more duplicates)
     *  - if in "very strict" mode, then only allow one emission per call
     *  Separately:
     *  - if "custom_delete_unhandled_duplicates" is true (false by default in "strict" mode, true by
     *    default in "very_strict" mode) then will issue deletes to any objects that aren't emitted by
     *    the user
     * @param to_emit the object the user wants to emit
     * @return success wrapping to_emit if the policy allows it, otherwise a fail wrapping the policy error
     */
    protected Validation<BasicMessageBean, JsonNode> checkObjectLogic(final JsonNode to_emit) {
        if (CustomPolicy.strict == _custom_policy) {
            final JsonNode _id = to_emit.get(AnnotationBean._ID);
            if (!_mutable_state._id_set.contains(_id)) { // (even if null) if "an object without a matching _id"
                if (_json_to_grouping_key.apply(to_emit).equals(_mutable_state._grouping_key)) { // only .. if it has a different grouping key
                    return Validation.fail(_ERROR_BEAN);
                }
            }
        } else if (CustomPolicy.very_strict == _custom_policy) {
            if (_mutable_state._num_emitted > 0) {
                return Validation.fail(_ERROR_BEAN);
            }
        }
        return Validation.success(to_emit);
    }

    /** Common emit logic: applies the policy check, then either records a manual delete or performs the emit
     * @param json the object being emitted
     * @param supplier performs the actual emit against the wrapped context
     * @return the policy error if the check fails; the input object if it was recorded as a manual delete;
     *         otherwise whatever the supplier returns
     */
    public Validation<BasicMessageBean, JsonNode> emitObjectCommon(final JsonNode json,
            final Supplier<Validation<BasicMessageBean, JsonNode>> supplier) {
        return checkObjectLogic(json).bind(j -> {
            // A single-field object that carries an _id is a request to delete that _id, not a real emit
            final boolean is_manual_delete = (1 == json.size());
            // (the _id is only needed for manual deletes or for tracking which duplicates were handled)
            final JsonNode _id = (_delete_unhandled_duplicates || is_manual_delete)
                    ? json.get(AnnotationBean._ID)
                    : null;
            if (is_manual_delete && (null != _id)) {
                _mutable_state._manual_ids_to_delete.add(_id);
                return Validation.success(json);
            } else {
                if (_delete_unhandled_duplicates || (CustomPolicy.very_strict == _custom_policy)) {
                    // This duplicate has been handled by the user, so it must not be auto-deleted
                    _mutable_state._id_set.remove(_id);
                }
                _mutable_state._num_emitted++;
                return supplier.get();
            }
        });

    }

    /** Policy-checked version of the wrapped context's emitMutableObject */
    @Override
    public Validation<BasicMessageBean, JsonNode> emitMutableObject(long id, ObjectNode mutated_json,
            Optional<AnnotationBean> annotations, Optional<JsonNode> grouping_key) {
        return emitObjectCommon(mutated_json,
                () -> _delegate.emitMutableObject(id, mutated_json, annotations, grouping_key));
    }

    /** Policy-checked version of the wrapped context's emitImmutableObject (the check is applied to original_json) */
    @Override
    public Validation<BasicMessageBean, JsonNode> emitImmutableObject(long id, JsonNode original_json,
            Optional<ObjectNode> mutations, Optional<AnnotationBean> annotations, Optional<JsonNode> grouping_key) {
        return emitObjectCommon(original_json,
                () -> _delegate.emitImmutableObject(id, original_json, mutations, annotations, grouping_key));
    }

    // (EXTERNAL EMIT HAS NO RESTRICTIONS)

    @Override
    public Validation<BasicMessageBean, JsonNode> externalEmit(DataBucketBean bucket,
            Either<JsonNode, Map<String, Object>> object, Optional<AnnotationBean> annotations) {
        return _delegate.externalEmit(bucket, object, annotations);
    }

    //////////////////////////////////////////////////////////////////////

    // FINALIZATION

    /** A stream of object ids that should be deleted: the manually requested deletes, followed (only if
     *  unhandled duplicates are being deleted) by the _ids of duplicates that haven't been emitted
     * @return the stream of _ids to delete
     */
    public Stream<JsonNode> getObjectIdsToDelete() {

        // Get a list of _ids that haven't been emitted
        final Collection<JsonNode> auto_delete = _delete_unhandled_duplicates ? _mutable_state._id_set
                : Collections.emptyList();

        return Stream.concat(_mutable_state._manual_ids_to_delete.stream(), auto_delete.stream());
    }

    //////////////////////////////////////////////////////////////////////

    // LEAVE ALL THESE UNCHANGED
    // (everything below forwards straight to the wrapped context)

    @Override
    public Collection<Object> getUnderlyingArtefacts() {
        return _delegate.getUnderlyingArtefacts();
    }

    @Override
    public <T> Optional<T> getUnderlyingPlatformDriver(Class<T> driver_class, Optional<String> driver_options) {
        return _delegate.getUnderlyingPlatformDriver(driver_class, driver_options);
    }

    @Override
    public String getEnrichmentContextSignature(Optional<DataBucketBean> bucket,
            Optional<Set<Tuple2<Class<? extends IUnderlyingService>, Optional<String>>>> services) {
        return _delegate.getEnrichmentContextSignature(bucket, services);
    }

    @Override
    public <T> Collection<Tuple2<T, String>> getTopologyEntryPoints(Class<T> clazz,
            Optional<DataBucketBean> bucket) {
        return _delegate.getTopologyEntryPoints(clazz, bucket);
    }

    @Override
    public <T> T getTopologyStorageEndpoint(Class<T> clazz, Optional<DataBucketBean> bucket) {
        return _delegate.getTopologyStorageEndpoint(clazz, bucket);
    }

    @Override
    public <T> T getTopologyErrorEndpoint(Class<T> clazz, Optional<DataBucketBean> bucket) {
        return _delegate.getTopologyErrorEndpoint(clazz, bucket);
    }

    @Override
    public long getNextUnusedId() {
        return _delegate.getNextUnusedId();
    }

    @Override
    public ObjectNode convertToMutable(JsonNode original) {
        return _delegate.convertToMutable(original);
    }

    @Override
    public void storeErroredObject(long id, JsonNode original_json) {
        _delegate.storeErroredObject(id, original_json);
    }

    @Override
    public CompletableFuture<?> flushBatchOutput(Optional<DataBucketBean> bucket) {
        return _delegate.flushBatchOutput(bucket);
    }

    @Override
    public IServiceContext getServiceContext() {
        return _delegate.getServiceContext();
    }

    @Override
    public <S> Optional<ICrudService<S>> getGlobalEnrichmentModuleObjectStore(Class<S> clazz,
            Optional<String> collection) {
        return _delegate.getGlobalEnrichmentModuleObjectStore(clazz, collection);
    }

    @Override
    public <S> ICrudService<S> getBucketObjectStore(Class<S> clazz, Optional<DataBucketBean> bucket,
            Optional<String> collection, Optional<StateDirectoryType> type) {
        return _delegate.getBucketObjectStore(clazz, bucket, collection, type);
    }

    @Override
    public Optional<DataBucketBean> getBucket() {
        return _delegate.getBucket();
    }

    @Override
    public Optional<SharedLibraryBean> getModuleConfig() {
        return _delegate.getModuleConfig();
    }

    @Override
    public Future<DataBucketStatusBean> getBucketStatus(Optional<DataBucketBean> bucket) {
        return _delegate.getBucketStatus(bucket);
    }

    @Override
    public void emergencyDisableBucket(Optional<DataBucketBean> bucket) {
        _delegate.emergencyDisableBucket(bucket);
    }

    @Override
    public void emergencyQuarantineBucket(Optional<DataBucketBean> bucket, String quarantine_duration) {
        _delegate.emergencyQuarantineBucket(bucket, quarantine_duration);
    }

    @Override
    public void initializeNewContext(String signature) {
        _delegate.initializeNewContext(signature);
    }

    @Override
    public IBucketLogger getLogger(Optional<DataBucketBean> bucket) {
        return _delegate.getLogger(bucket);
    }
}