com.thinkbiganalytics.spark.datavalidator.functions.CleanseAndValidateRow.java Source code

Introduction

Here is the source code for com.thinkbiganalytics.spark.datavalidator.functions.CleanseAndValidateRow.java, a Spark function that cleanses and validates each data row against its field policies and the target data types.

Source

package com.thinkbiganalytics.spark.datavalidator.functions;

/*-
 * #%L
 * kylo-spark-validate-cleanse-core
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.Iterables;
import com.thinkbiganalytics.policy.BaseFieldPolicy;
import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.policy.standardization.AcceptsEmptyValues;
import com.thinkbiganalytics.policy.standardization.StandardizationPolicy;
import com.thinkbiganalytics.policy.validation.NotNullValidator;
import com.thinkbiganalytics.policy.validation.ValidationPolicy;
import com.thinkbiganalytics.policy.validation.ValidationResult;
import com.thinkbiganalytics.spark.datavalidator.CleansedRowResult;
import com.thinkbiganalytics.spark.datavalidator.StandardizationAndValidationResult;
import com.thinkbiganalytics.spark.util.InvalidFormatException;
import com.thinkbiganalytics.spark.validation.HCatDataType;

import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.ParameterizedType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import static com.thinkbiganalytics.spark.datavalidator.StandardDataValidator.VALID_RESULT;

/**
 * Spark function to perform both cleansing and validation of a data row based on data policies and the target datatype
 */
public class CleanseAndValidateRow implements Function<Row, CleansedRowResult> {

    /**
     * Name of the processing date/time column.
     */
    public static final String PROCESSING_DTTM_COL = "processing_dttm";

    /**
     * Name of the column containing the rejection reason.
     */
    public static final String REJECT_REASON_COL = "dlp_reject_reason";

    private static final Logger log = LoggerFactory.getLogger(CleanseAndValidateRow.class);
    private static final long serialVersionUID = 5092972811157021179L;

    /**
     * Target data types
     */
    @Nonnull
    private final HCatDataType[] dataTypes;

    /**
     * Indicates if the data contains a processing date/time column.
     */
    private final boolean hasProcessingDttm;

    @Nonnull
    private final FieldPolicy[] policies;

    /**
     * Schema of output rows
     */
    @Nonnull
    private final StructType schema;

    public CleanseAndValidateRow(@Nonnull final FieldPolicy[] policies, @Nonnull final StructField[] fields) {
        this.policies = policies;
        hasProcessingDttm = Iterables.any(Arrays.asList(fields), new Predicate<StructField>() {
            @Override
            public boolean apply(@Nullable StructField input) {
                return input != null && input.name().equals(PROCESSING_DTTM_COL);
            }
        });

        dataTypes = resolveDataTypes(fields);
        schema = getSchema(fields);
    }

    @Override
    public CleansedRowResult call(@Nonnull final Row row) throws Exception {
        /*
        Cache for performance. Validators accept different parameters (numeric, string, etc.) so we need to resolve the type using reflection
        */
        Map<Class, Class> validatorParamType = new HashMap<>();

        int nulls = hasProcessingDttm ? 1 : 0;

        // Create a placeholder for the new values plus one column for the reject reason
        Object[] newValues = new Object[dataTypes.length + 1];
        boolean rowValid = true;
        String sbRejectReason;
        List<ValidationResult> results = null;
        boolean[] columnsValid = new boolean[dataTypes.length];

        Map<Integer, Object> originalValues = new HashMap<>();

        // Iterate through columns to cleanse and validate
        for (int idx = 0; idx < dataTypes.length; idx++) {
            ValidationResult result;
            FieldPolicy fieldPolicy = policies[idx];
            HCatDataType dataType = dataTypes[idx];
            boolean columnValid = true;
            boolean isBinaryType = dataType.getConvertibleType().equals(byte[].class);

            // Extract the value (allowing for null or missing field for odd-ball data)
            Object val = (idx == row.length() || row.isNullAt(idx) ? null : row.get(idx));
            // Handle complex types by passing them through

            if (dataType.isUnchecked()) {
                if (val == null) {
                    nulls++;
                }
                newValues[idx] = val;
                originalValues.put(idx, val);
            } else {
                Object fieldValue = val;
                boolean isEmpty;

                if (fieldValue == null) {
                    nulls++;
                }
                originalValues.put(idx, fieldValue);

                StandardizationAndValidationResult standardizationAndValidationResult = standardizeAndValidateField(
                        fieldPolicy, fieldValue, dataType, validatorParamType);
                result = standardizationAndValidationResult.getFinalValidationResult();

                //only apply the standardized value if the validation result is valid
                fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;

                //reevaluate the isEmpty flag
                isEmpty = ((fieldValue == null) || (StringUtils.isEmpty(fieldValue.toString())));

                //if the field is a binary type but can't be converted, set it to null.
                //hive will auto-convert byte[] or String fields to a target binary type.
                if (result.isValid() && isBinaryType && !(fieldValue instanceof byte[])
                        && !(fieldValue instanceof String)) {
                    //set it to null
                    fieldValue = null;
                } else if ((dataType.isNumeric() || isBinaryType) && isEmpty) {
                    //if it's a numeric column and the field is empty then set it to null as well
                    fieldValue = null;
                }
                newValues[idx] = fieldValue;

                if (!result.isValid()) {
                    rowValid = false;
                    results = (results == null ? new Vector<ValidationResult>() : results);
                    results.addAll(standardizationAndValidationResult.getValidationResults());
                    columnValid = false;
                }

            }

            // Record whether this column was valid
            columnsValid[idx] = columnValid;
        }
        // Fail the row if all values were null; that indicates a blank line in the file.
        if (nulls >= dataTypes.length) {
            rowValid = false;
            results = (results == null ? new Vector<ValidationResult>() : results);
            results.add(ValidationResult.failRow("empty", "Row is empty"));
        }

        if (!rowValid) {
            for (int idx = 0; idx < dataTypes.length; idx++) {
                //if the new value is null or its data type has changed, replace it with the original value
                //the _invalid table's dataTypes match the source, not the destination
                if (newValues[idx] == null || originalValues.get(idx) == null
                        || newValues[idx].getClass() != originalValues.get(idx).getClass()) {
                    newValues[idx] = originalValues.get(idx);
                }
                //otherwise the data has changed but is still the same data type, so we can keep the newly changed value

            }
        }

        // Convert the reject reasons to JSON
        sbRejectReason = toJSONArray(results);

        // Record the results in the appended columns; move the processing partition value last
        if (hasProcessingDttm) {
            newValues[dataTypes.length] = newValues[dataTypes.length - 1]; //PROCESSING_DTTM_COL
            newValues[dataTypes.length - 1] = sbRejectReason; //REJECT_REASON_COL
        } else {
            newValues[dataTypes.length] = sbRejectReason;
        }

        return new CleansedRowResult(RowFactory.create(newValues), columnsValid, rowValid);
    }

    /**
     * Gets the schema for the output rows.
     */
    @Nonnull
    public StructType getSchema() {
        return schema;
    }

    StandardizationAndValidationResult standardizeAndValidateField(FieldPolicy fieldPolicy, Object value,
            HCatDataType dataType, Map<Class, Class> validatorParamType) {
        StandardizationAndValidationResult result = new StandardizationAndValidationResult(value);

        List<BaseFieldPolicy> fieldPolicies = fieldPolicy.getAllPolicies();

        int standardizerCount = 0;
        for (BaseFieldPolicy p : fieldPolicies) {
            if (p instanceof StandardizationPolicy) {
                standardizerCount++;
            }
        }
        boolean validateNullValues = false;
        int processedStandardizers = 0;
        for (BaseFieldPolicy p : fieldPolicies) {
            boolean isEmpty = ((result.getFieldValue() == null)
                    || (StringUtils.isEmpty(result.getFieldValue().toString())));
            if (p instanceof StandardizationPolicy) {
                processedStandardizers++;
                StandardizationPolicy standardizationPolicy = (StandardizationPolicy) p;
                boolean shouldStandardize = true;
                if (isEmpty && !(standardizationPolicy instanceof AcceptsEmptyValues)) {
                    shouldStandardize = false;
                }

                if (!standardizationPolicy.accepts(result.getFieldValue())) {
                    shouldStandardize = false;
                }

                if (shouldStandardize) {
                    Object newValue = standardizationPolicy.convertRawValue(result.getFieldValue());

                    //If this is the last standardizer for this field and the standardized value's type differs from the target column type, validate it and convert it to the correct type
                    if (newValue != null && dataType.getConvertibleType() != newValue.getClass()
                            && standardizerCount == processedStandardizers) {
                        try {
                            //Date and timestamp fields can be valid as strings
                            boolean isValueOk = dataType.isStringValueValidForHiveType(newValue.toString());
                            if (!isValueOk) {
                                //if the current string is not in a correct format, attempt to convert it
                                try {
                                    newValue = dataType.toNativeValue(newValue.toString());
                                } catch (RuntimeException e) {
                                    result.addValidationResult(ValidationResult.failField("incompatible",
                                            dataType.getName(), "Not convertible to " + dataType.getNativeType()));
                                }
                            }
                        } catch (InvalidFormatException e) {
                            log.warn("Could not convert value {} to correct type {}", newValue.toString(),
                                    dataType.getConvertibleType().getName());
                        }
                    }
                    result.setFieldValue(newValue);
                }
            }

            if (p instanceof ValidationPolicy) {
                ValidationPolicy validationPolicy = (ValidationPolicy) p;
                //run through the validator if the value is not null, or if we are allowed to validate nulls based upon a previous
                //not null validator
                if (!isEmpty || validateNullValues || validationPolicy instanceof NotNullValidator) {
                    ValidationResult validationResult = validateValue(validationPolicy, dataType,
                            result.getFieldValue(), validatorParamType);
                    if (isEmpty && validationPolicy instanceof NotNullValidator) {
                        validateNullValues = validationPolicy != VALID_RESULT;
                    }
                    //only need to add those that are invalid
                    if (validationResult != VALID_RESULT) {
                        result.addValidationResult(validationResult);
                        break; //stop processing this field once an invalid result is found
                    }
                }
                //reset the validateNullValues flag back to false
                if (!(validationPolicy instanceof NotNullValidator)) {
                    validateNullValues = false;
                }
            }

        }
        ValidationResult finalValidationCheck = finalValidationCheck(fieldPolicy, dataType, result.getFieldValue());
        if (finalValidationCheck != VALID_RESULT) {
            result.addValidationResult(finalValidationCheck);
        }

        return result;
    }

    /**
     * Performs validation using both the data type conversion check and the validation policies
     */
    private ValidationResult finalValidationCheck(FieldPolicy fieldPolicy, HCatDataType fieldDataType,
            Object fieldValue) {

        boolean isEmpty = ((fieldValue instanceof String && StringUtils.isEmpty((String) fieldValue))
                || fieldValue == null);
        if (!isEmpty && !fieldPolicy.shouldSkipSchemaValidation()
                && !fieldDataType.isValueConvertibleToType(fieldValue)) {
            return ValidationResult.failField("incompatible", fieldDataType.getName(),
                    "Not convertible to " + fieldDataType.getNativeType());
        }

        return VALID_RESULT;
    }

    /**
     * Gets the list of all feed field names that are part of the policy map
     */
    @Nonnull
    private List<String> getPolicyMapFeedFieldNames() {
        return FluentIterable.from(Arrays.asList(policies)).filter(new Predicate<FieldPolicy>() {
            @Override
            public boolean apply(@Nullable FieldPolicy input) {
                return (input != null && input.getFeedField() != null);
            }
        }).transform(new com.google.common.base.Function<FieldPolicy, String>() {
            @Override
            public String apply(@Nullable FieldPolicy input) {
                assert input != null;
                return input.getFeedField().toLowerCase();
            }
        }).toList();
    }

    /**
     * Gets the output schema for the specified target fields.
     */
    @Nonnull
    private StructType getSchema(@Nonnull final StructField[] feedFields) {
        final List<StructField> feedFieldsList = new ArrayList<>(feedFields.length);
        final List<String> policyMapFeedFieldNames = getPolicyMapFeedFieldNames();

        for (final StructField feedField : feedFields) {
            final String lowerFieldName = feedField.name().toLowerCase();
            if (policyMapFeedFieldNames.contains(lowerFieldName)) {
                if (!feedField.name().equalsIgnoreCase(PROCESSING_DTTM_COL)
                        && !feedField.name().equalsIgnoreCase(REJECT_REASON_COL)) {
                    feedFieldsList.add(feedField);
                }
            } else {
                log.warn("Feed table field {} is not present in policy map", lowerFieldName);
            }

        }

        // Append the processing partition column, then insert the reject-reason column before it so the partition column stays last
        if (hasProcessingDttm) {
            feedFieldsList.add(new StructField(PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
            feedFieldsList.add(feedFieldsList.size() - 1,
                    new StructField(REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
        } else {
            feedFieldsList.add(new StructField(REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
        }

        return new StructType(feedFieldsList.toArray(new StructField[0]));
    }

    /**
     * Converts the table schema into the corresponding data type structures
     */
    @Nonnull
    private HCatDataType[] resolveDataTypes(StructField[] fields) {
        List<HCatDataType> cols = new ArrayList<>(fields.length);

        for (StructField field : fields) {
            String colName = field.name();
            String dataType = field.dataType().simpleString();
            cols.add(HCatDataType.createFromDataType(colName, dataType));
        }
        return cols.toArray(new HCatDataType[0]);
    }

    /* Resolves the parameter type required by the validator. A cache is used to avoid the cost of reflection. */
    private Class resolveValidatorParamType(ValidationPolicy validator, Map<Class, Class> validatorParamType) {
        Class expectedParamClazz = validatorParamType.get(validator.getClass());
        if (expectedParamClazz == null) {
            // Cache for future references

            Object t = validator.getClass().getGenericInterfaces()[0];
            if (t instanceof ParameterizedType) {
                ParameterizedType type = (ParameterizedType) t;
                expectedParamClazz = (Class) type.getActualTypeArguments()[0];
            } else {
                expectedParamClazz = String.class;
            }
            validatorParamType.put(validator.getClass(), expectedParamClazz);
        }
        return expectedParamClazz;
    }

    private String toJSONArray(List<ValidationResult> results) {
        // Convert the reject reasons to a JSON array
        StringBuilder sb = null;
        if (results != null) {
            sb = new StringBuilder();
            for (ValidationResult result : results) {
                if (sb.length() > 0) {
                    sb.append(",");
                } else {
                    sb.append("[");
                }
                sb.append(result.toJSON());
            }
            sb.append("]");
        }
        return (sb == null ? "" : sb.toString());
    }

    private ValidationResult validateValue(ValidationPolicy validator, HCatDataType fieldDataType,
            Object fieldValue, Map<Class, Class> validatorParamType) {
        try {
            // Resolve the type of parameter required by the validator. A cache is used to avoid the cost of reflection.
            Class expectedParamClazz = resolveValidatorParamType(validator, validatorParamType);
            Object nativeValue = fieldValue;
            if (expectedParamClazz != String.class && fieldValue instanceof String) {
                nativeValue = fieldDataType.toNativeValue(fieldValue.toString());
            }
            if (!validator.validate(nativeValue)) {

                //get any fields in this validator annotated with PolicyProperty

                return ValidationResult.failFieldRule("rule", fieldDataType.getName(),
                        validator.getClass().getSimpleName(), "Rule violation");
            }
            return VALID_RESULT;
        } catch (InvalidFormatException | ClassCastException e) {
            return ValidationResult.failField("incompatible", fieldDataType.getName(),
                    "Not convertible to " + fieldDataType.getNativeType());
        }

    }
}
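
Example usage

The sketch below shows how this function might be wired into a Spark driver: build the function from the field policies and the incoming schema, map it over the rows, and keep the rows it marked as valid. It is a minimal illustration under stated assumptions, not Kylo's actual driver (that is StandardDataValidator). In particular, the 'rowIsValid' and 'row' accessors on CleansedRowResult are assumed names and should be checked against that class, and Java 8 lambdas are assumed for the filter/map steps.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.spark.datavalidator.CleansedRowResult;
import com.thinkbiganalytics.spark.datavalidator.functions.CleanseAndValidateRow;

public class CleanseAndValidateRowExample {

    /**
     * Applies the cleanse/validate function to every row and keeps only the rows it marked valid.
     * 'source' is the feed data loaded elsewhere; 'policies' holds one FieldPolicy per source column.
     */
    public static JavaRDD<Row> cleanse(JavaRDD<Row> source, StructField[] fields, FieldPolicy[] policies) {
        // The function resolves the target HCat data types and the output schema up front,
        // then cleanses and validates each row independently.
        CleanseAndValidateRow cleanser = new CleanseAndValidateRow(policies, fields);

        // Output schema: the dlp_reject_reason column is appended and processing_dttm (if present) stays last.
        // It can be used to rebuild a DataFrame/Dataset from the returned rows.
        StructType outputSchema = cleanser.getSchema();

        JavaRDD<CleansedRowResult> results = source.map(cleanser);

        // 'rowIsValid' and 'row' are assumed accessors on CleansedRowResult (illustration only);
        // check that class for the actual names before using this sketch.
        return results.filter(result -> result.rowIsValid)
                      .map(result -> result.row);
    }
}

Because the function is a plain org.apache.spark.api.java.function.Function, it is applied once per input row and carries everything it needs (policies, resolved data types, output schema) in its own serialized state.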