org.icgc.dcc.submission.dictionary.DictionaryValidator.java Source code

Java tutorial

Introduction

Here is the source code for org.icgc.dcc.submission.dictionary.DictionaryValidator.java

Source

/*
 * Copyright (c) 2013 The Ontario Institute for Cancer Research. All rights reserved.                             
 *                                                                                                               
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with                                  
 * this program. If not, see <http://www.gnu.org/licenses/>.                                                     
 *                                                                                                               
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY                           
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES                          
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT                           
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                                
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED                          
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;                               
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER                              
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN                         
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.submission.dictionary;

import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.Iterables.getFirst;
import static com.google.common.collect.Maps.newHashMap;
import static com.google.common.collect.Sets.difference;
import static com.google.common.collect.Sets.newCopyOnWriteArraySet;
import static com.google.common.collect.Sets.newHashSet;
import static com.google.common.collect.Sets.newLinkedHashSet;
import static org.apache.commons.lang.StringUtils.isBlank;
import static org.apache.commons.lang.StringUtils.split;
import static org.icgc.dcc.core.model.FieldNames.SubmissionFieldNames.SUBMISSION_DONOR_ID;
import static org.icgc.dcc.core.model.FieldNames.SubmissionFieldNames.SUBMISSION_OBSERVATION_CHROMOSOME;
import static org.icgc.dcc.core.model.FieldNames.SubmissionFieldNames.SUBMISSION_OBSERVATION_CHROMOSOME_END;
import static org.icgc.dcc.core.model.FieldNames.SubmissionFieldNames.SUBMISSION_OBSERVATION_CHROMOSOME_START;
import static org.icgc.dcc.core.model.FieldNames.SubmissionFieldNames.SUBMISSION_OBSERVATION_MUTATION_TYPE;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import lombok.Value;
import lombok.val;
import lombok.extern.slf4j.Slf4j;

import org.icgc.dcc.core.model.BusinessKeys;
import org.icgc.dcc.core.model.ValueType;
import org.icgc.dcc.submission.dictionary.model.CodeList;
import org.icgc.dcc.submission.dictionary.model.Dictionary;
import org.icgc.dcc.submission.dictionary.model.Field;
import org.icgc.dcc.submission.dictionary.model.FileSchema;
import org.icgc.dcc.submission.dictionary.model.Restriction;
import org.icgc.dcc.submission.dictionary.model.RestrictionType;
import org.icgc.dcc.submission.dictionary.model.SummaryType;
import org.icgc.dcc.submission.validation.primary.restriction.CodeListRestriction;
import org.icgc.dcc.submission.validation.primary.restriction.DiscreteValuesRestriction;
import org.icgc.dcc.submission.validation.primary.restriction.RangeFieldRestriction;
import org.icgc.dcc.submission.validation.primary.restriction.ScriptRestriction;
import org.icgc.dcc.submission.validation.primary.restriction.ScriptRestriction.InvalidScriptException;

import com.google.common.base.Function;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Sets.SetView;
import com.google.common.collect.Table;
import com.google.common.primitives.Doubles;
import com.google.common.primitives.Longs;

@Slf4j
public class DictionaryValidator {

    private final Dictionary dictionary;
    private final DictionaryIndex dictionaryIndex;
    private final CodeListIndex codeListIndex;

    /**
     * Creates a validator for a dictionary together with the code lists its fields may reference.
     *
     * @param dictionary the dictionary to validate; a {@code null} value is rejected immediately
     * @param codeLists the code lists to index by name for reference checks
     */
    public DictionaryValidator(Dictionary dictionary, Iterable<CodeList> codeLists) {
        // Reject a missing dictionary before any index is built from it
        checkNotNull(dictionary);

        this.dictionary = dictionary;
        this.dictionaryIndex = new DictionaryIndex(dictionary);
        this.codeListIndex = new CodeListIndex(codeLists);
    }

    /**
     * Runs all schema and code list checks and collects the resulting violations.
     * <p>
     * Any exception raised while validating is logged and reported as an additional error instead of being
     * propagated to the caller.
     *
     * @return the warnings and errors found, each in discovery order
     */
    public DictionaryConstraintViolations validate() {
        final Set<DictionaryConstraintViolation> errorSet = newLinkedHashSet();
        final Set<DictionaryConstraintViolation> warningSet = newLinkedHashSet();

        try {
            validateSchemata(errorSet, warningSet);
            validateCodeLists(errorSet, warningSet);
        } catch (Exception failure) {
            log.error("Exception validating:", failure);

            val violation = new DictionaryConstraintViolation("Exception validating", failure.getMessage());
            errorSet.add(violation);
        }

        return new DictionaryConstraintViolations(warningSet, errorSet);
    }

    /**
     * Validates every file schema of the dictionary (file pattern, field names, fields, relations) and then the
     * business keys that span schemas.
     *
     * @param errors collector for error-level violations
     * @param warnings collector for warning-level violations
     */
    private void validateSchemata(Set<DictionaryConstraintViolation> errors,
            Set<DictionaryConstraintViolation> warnings) {
        for (val schema : dictionary.getFiles()) {
            validateFilePattern(errors, schema);
            validateFieldNames(errors, schema);
            validateFields(errors, warnings, schema);
            validateRelations(errors, schema);
        }

        validateBusinessKeys(errors, warnings);
    }

    /**
     * Checks that the schema declares a non-blank file pattern that compiles as a regular expression.
     */
    private void validateFilePattern(Set<DictionaryConstraintViolation> errors, FileSchema schema) {
        val filePattern = schema.getPattern();
        if (isBlank(filePattern)) {
            errors.add(new DictionaryConstraintViolation("Missing schema file pattern", schema.getName()));
            return;
        }

        try {
            // Compiled only to verify the syntax; the result is discarded
            Pattern.compile(filePattern);
        } catch (PatternSyntaxException e) {
            errors.add(new DictionaryConstraintViolation("Invalid schema file pattern", schema.getName(),
                    filePattern));
        }
    }

    /**
     * Validates each field of a schema: the combination of its restriction types, the compatibility of its
     * summary type with its value type, and finally each individual restriction.
     *
     * @param errors collector for error-level violations
     * @param warnings collector for warning-level violations
     * @param schema the schema whose fields are checked
     */
    private void validateFields(Set<DictionaryConstraintViolation> errors,
            Set<DictionaryConstraintViolation> warnings, FileSchema schema) {
        for (val field : schema.getFields()) {
            val schemaName = schema.getName();
            val fieldName = field.getName();

            // Work on a copy so that dropping REQUIRED does not touch the set held by the index
            Set<RestrictionType> otherTypes = newCopyOnWriteArraySet(
                    dictionaryIndex.getRestrictionTypes(schemaName, fieldName));
            otherTypes.remove(RestrictionType.REQUIRED);

            // NOTE(review): the threshold allows up to two non-REQUIRED restriction types - confirm this is intended
            if (otherTypes.size() > 2) {
                errors.add(new DictionaryConstraintViolation("Incompatible field restrictions", schemaName,
                        fieldName, otherTypes));
            }

            val summaryType = field.getSummaryType();
            if (summaryType == SummaryType.FREQUENCY) {
                if (field.getValueType().isNumeric()) {
                    warnings.add(new DictionaryConstraintViolation("Potentially large field summary value set",
                            schema, field, summaryType));
                }

                val uniqueFields = schema.getUniqueFields();
                val soleUniqueField = uniqueFields.size() == 1 && uniqueFields.contains(field.getName());
                if (soleUniqueField) {
                    warnings.add(new DictionaryConstraintViolation("Frequency defined for unique field", schema,
                            field, summaryType, uniqueFields));
                }
            } else if (summaryType == SummaryType.AVERAGE) {
                if (!field.getValueType().isNumeric()) {
                    errors.add(new DictionaryConstraintViolation("Incompatible numeric field summary type", schema,
                            field, summaryType));
                }
            }

            validateRestrictions(errors, schema, field);
        }
    }

    /**
     * Validates the configuration of every restriction attached to a field.
     * <p>
     * A restriction without a type is reported and otherwise skipped. CODELIST, DISCRETE_VALUES, RANGE and SCRIPT
     * restrictions are checked by dedicated helpers; other types are not inspected here. Missing configuration
     * parameters are reported as violations rather than causing a {@link NullPointerException} that would abort
     * the whole validation run.
     *
     * @param errors collector for error-level violations
     * @param schema the schema that owns {@code field}
     * @param field the field whose restrictions are checked
     */
    private void validateRestrictions(Set<DictionaryConstraintViolation> errors, FileSchema schema, Field field) {
        for (val restriction : field.getRestrictions()) {
            val type = restriction.getType();
            if (type == null) {
                errors.add(new DictionaryConstraintViolation("Field restriction type is blank", schema, field,
                        restriction));
            } else if (type == RestrictionType.CODELIST) {
                validateCodeListRestriction(errors, schema, field, restriction);
            } else if (type == RestrictionType.DISCRETE_VALUES) {
                validateDiscreteValuesRestriction(errors, schema, field, restriction);
            } else if (type == RestrictionType.RANGE) {
                validateRangeRestriction(errors, schema, field, restriction);
            } else if (type == RestrictionType.SCRIPT) {
                validateScriptRestriction(errors, schema, field, restriction);
            }
        }
    }

    /** Checks that a CODELIST restriction names an existing code list. */
    private void validateCodeListRestriction(Set<DictionaryConstraintViolation> errors, FileSchema schema,
            Field field, Restriction restriction) {
        String codeListName = restriction.getConfig().getString(CodeListRestriction.FIELD);
        if (isBlank(codeListName)) {
            errors.add(new DictionaryConstraintViolation("Field code list name is blank", schema, field,
                    restriction));
        } else if (!codeListIndex.has(codeListName)) {
            errors.add(new DictionaryConstraintViolation("Field invalid code list reference", schema, field,
                    restriction));
        }
    }

    /** Checks that a DISCRETE_VALUES restriction has a values parameter without blank entries. */
    private void validateDiscreteValuesRestriction(Set<DictionaryConstraintViolation> errors, FileSchema schema,
            Field field, Restriction restriction) {
        String text = restriction.getConfig().getString(DiscreteValuesRestriction.PARAM);
        if (text == null) {
            // split(null, ...) returns null, which would make the loop below throw
            errors.add(new DictionaryConstraintViolation("Discrete values restriction is missing values parameter",
                    schema, field, restriction));
            return;
        }

        for (val value : split(text, ",")) {
            if (isBlank(value)) {
                errors.add(new DictionaryConstraintViolation("Blank discrete value", schema, field, restriction));
                break; // One violation per restriction is enough
            }
        }
    }

    /** Checks that a RANGE restriction sits on a numeric field and that its bounds parse as that field's type. */
    private void validateRangeRestriction(Set<DictionaryConstraintViolation> errors, FileSchema schema,
            Field field, Restriction restriction) {
        val config = restriction.getConfig();
        String min = config.getString(RangeFieldRestriction.MIN);
        String max = config.getString(RangeFieldRestriction.MAX);

        val valueType = field.getValueType();
        if (!valueType.isNumeric()) {
            errors.add(new DictionaryConstraintViolation("Non-numeric range field value type", schema, field,
                    restriction, valueType));
        }

        if (valueType == ValueType.INTEGER) {
            if (!isIntegerLiteral(min)) {
                errors.add(new DictionaryConstraintViolation("Non INTEGER range min value", schema, field,
                        restriction, min));
            }
            if (!isIntegerLiteral(max)) {
                errors.add(new DictionaryConstraintViolation("Non INTEGER range max value", schema, field,
                        restriction, max));
            }
        } else if (valueType == ValueType.DECIMAL) {
            if (!isDecimalLiteral(min)) {
                errors.add(new DictionaryConstraintViolation("Non DECIMAL range min value", schema, field,
                        restriction, min));
            }
            if (!isDecimalLiteral(max)) {
                errors.add(new DictionaryConstraintViolation("Non DECIMAL range max value", schema, field,
                        restriction, max));
            }
        }
    }

    /** Checks that a SCRIPT restriction has a description and a script whose inputs are fields of the schema. */
    private void validateScriptRestriction(Set<DictionaryConstraintViolation> errors, FileSchema schema,
            Field field, Restriction restriction) {
        val config = restriction.getConfig();

        String description = config.getString(ScriptRestriction.PARAM_DESCRIPTION);
        if (isBlank(description)) {
            errors.add(new DictionaryConstraintViolation("Script restriction is missing description parameter",
                    schema, field, restriction));
        }

        String script = config.getString(ScriptRestriction.PARAM);
        if (isBlank(script)) {
            errors.add(new DictionaryConstraintViolation("Script restriction is missing script parameter", schema,
                    field, restriction));
            return; // Nothing to inspect without a script
        }

        try {
            val scriptContext = new ScriptRestriction.ScriptContext(script);
            for (val inputName : scriptContext.getInputs().keySet()) {
                if (dictionaryIndex.getField(schema.getName(), inputName) == null) {
                    errors.add(new DictionaryConstraintViolation("File schema is missing referenced script field",
                            schema, field, restriction, script, inputName));
                }
            }
        } catch (InvalidScriptException e) {
            errors.add(new DictionaryConstraintViolation(e.getMessage(), schema, field, restriction, script));
        }
    }

    /** Returns whether {@code text} is present and parses as a long; {@code null} counts as unparseable. */
    private static boolean isIntegerLiteral(String text) {
        return text != null && Longs.tryParse(text) != null;
    }

    /** Returns whether {@code text} is present and parses as a double; {@code null} counts as unparseable. */
    private static boolean isDecimalLiteral(String text) {
        return text != null && Doubles.tryParse(text) != null;
    }

    /**
     * Reports every field name that occurs more than once within a schema.
     *
     * @param errors collector for error-level violations
     * @param schema the schema whose field names are checked
     */
    private void validateFieldNames(Set<DictionaryConstraintViolation> errors, FileSchema schema) {
        val occurrences = HashMultiset.create(schema.getFieldNames());

        // One entry per distinct name, carrying how often that name was seen
        for (val entry : occurrences.entrySet()) {
            val duplicated = entry.getCount() > 1;
            if (duplicated) {
                errors.add(new DictionaryConstraintViolation("Duplicate field name", schema.getName(),
                        entry.getElement()));
            }
        }
    }

    /**
     * Validates the relations a schema declares towards other schemas.
     * <p>
     * For each relation this checks that both field lists are non-empty and of equal size, that the local fields
     * exist in this schema, that the other schema exists, that the other fields exist in the <em>other</em>
     * schema, and that the other fields cover the other schema's unique fields. When the other schema does not
     * exist, the checks that depend on it are skipped after the missing schema is reported.
     *
     * @param errors collector for error-level violations
     * @param schema the schema whose relations are checked
     */
    private void validateRelations(Set<DictionaryConstraintViolation> errors, FileSchema schema) {
        for (val relation : schema.getRelations()) {
            if (relation.getFields().isEmpty()) {
                errors.add(
                        new DictionaryConstraintViolation("Missing schema fields for relation", schema, relation));
            }

            if (relation.getOtherFields().isEmpty()) {
                errors.add(new DictionaryConstraintViolation("Missing other schema fields for relation", schema,
                        relation));
            }

            if (relation.getFields().size() != relation.getOtherFields().size()) {
                errors.add(new DictionaryConstraintViolation(
                        "Mismatching field count between schema and other schema for relation", schema, relation,
                        relation.getFields().size(), relation.getOtherFields().size()));
            }

            for (val fieldName : relation.getFields()) {
                if (!dictionaryIndex.hasField(schema.getName(), fieldName)) {
                    errors.add(new DictionaryConstraintViolation("Invalid schema field for relation", schema,
                            relation, fieldName));
                }
            }

            FileSchema otherSchema = dictionaryIndex.getSchema(relation.getOther());
            if (otherSchema == null) {
                errors.add(new DictionaryConstraintViolation("Invalid other schema for relation", schema, relation,
                        relation.getOther()));

                // The remaining checks need the other schema; without it they would dereference null
                continue;
            }

            for (val otherFieldName : relation.getOtherFields()) {
                // Other-side fields must be resolved against the other schema, not the current one
                if (!dictionaryIndex.hasField(relation.getOther(), otherFieldName)) {
                    errors.add(new DictionaryConstraintViolation("Invalid other schema field for relation", schema,
                            relation, otherFieldName));
                }
            }

            // Unique fields of the other schema that the relation does not reference
            SetView<String> difference = difference(newHashSet(otherSchema.getUniqueFields()),
                    newHashSet(relation.getOtherFields()));
            if (!difference.isEmpty()) {
                errors.add(new DictionaryConstraintViolation("Other schema fields are not unique for relation",
                        schema, relation, difference));
            }
        }
    }

    /**
     * Verifies that the fields making up business keys carry a REQUIRED restriction. Only the 'ssm_p' mutation key
     * fields and the 'donor' id field are covered so far.
     *
     * @param errors collector for error-level violations
     * @param warnings collector for warning-level violations (not written to by the current checks)
     */
    private void validateBusinessKeys(Set<DictionaryConstraintViolation> errors,
            Set<DictionaryConstraintViolation> warnings) {
        validateMutationBusinessKey(errors);
        validateDonorBusinessKey(errors);

        // TODO: Add validations for remaining business keys
    }

    /**
     * Checks that the 'ssm_p' schema exists and that each field of the relaxed mutation key is required.
     */
    private void validateMutationBusinessKey(Set<DictionaryConstraintViolation> errors) {
        FileSchema ssm_p = dictionaryIndex.getSchema("ssm_p");
        if (ssm_p == null) {
            errors.add(new DictionaryConstraintViolation(
                    "'ssm_p' schema is missing but is required for required business key field validation"));
            return;
        }

        // See BusinessKeys.MUTATION_KEY
        // TODO: Add the full set after the dictionary is created
        val relaxedMutationKey = ImmutableList.<String>of(
                SUBMISSION_OBSERVATION_CHROMOSOME,
                SUBMISSION_OBSERVATION_CHROMOSOME_START,
                SUBMISSION_OBSERVATION_CHROMOSOME_END,
                SUBMISSION_OBSERVATION_MUTATION_TYPE);

        for (val keyField : relaxedMutationKey) {
            if (!dictionaryIndex.hasRestrictionType(ssm_p.getName(), keyField, RestrictionType.REQUIRED)) {
                errors.add(new DictionaryConstraintViolation(
                        "'ssm_p' schema field is required for downstream processing", keyField,
                        BusinessKeys.MUTATION));
            }
        }
    }

    /**
     * Checks that the 'donor' schema exists and that its donor id field is required.
     */
    private void validateDonorBusinessKey(Set<DictionaryConstraintViolation> errors) {
        FileSchema donor = dictionaryIndex.getSchema("donor");
        if (donor == null) {
            errors.add(new DictionaryConstraintViolation(
                    "'donor' schema is missing but is required for required business key field validation"));
            return;
        }

        val keyField = SUBMISSION_DONOR_ID;
        if (!dictionaryIndex.hasRestrictionType(donor.getName(), keyField, RestrictionType.REQUIRED)) {
            errors.add(new DictionaryConstraintViolation(
                    "'donor' schema field is required for business key field", keyField));
        }
    }

    /**
     * Validates every code list referenced by the dictionary.
     * <p>
     * A referenced code list that cannot be found yields a warning and validation moves on to the next name, so
     * one missing code list no longer hides problems in the ones after it. Several code lists sharing a name yield
     * an error; the first of them is then checked for duplicate codes, duplicate values, and values that collide
     * with another term's code.
     *
     * @param errors collector for error-level violations
     * @param warnings collector for warning-level violations
     */
    private void validateCodeLists(Set<DictionaryConstraintViolation> errors,
            Set<DictionaryConstraintViolation> warnings) {
        for (val codeListName : dictionary.getCodeListNames()) {
            val collection = codeListIndex.get(codeListName);
            int count = collection.size();
            if (count == 0) {
                warnings.add(new DictionaryConstraintViolation("Missing code list", codeListName));

                // Skip only this code list; the remaining names still need to be validated
                continue;
            }
            if (count > 1) {
                errors.add(new DictionaryConstraintViolation("Duplicate code lists", collection));
            }

            val codeList = getFirst(collection, null);

            // First pass: count how often each code and each value occurs
            Multiset<String> codes = HashMultiset.create();
            Multiset<String> values = HashMultiset.create();
            for (val term : codeList.getTerms()) {
                codes.add(term.getCode());
                values.add(term.getValue());
            }

            // Second pass: report each term involved in a collision
            for (val term : codeList.getTerms()) {
                val code = term.getCode();
                val value = term.getValue();

                if (codes.count(code) > 1) {
                    errors.add(
                            new DictionaryConstraintViolation("Duplicate code list codes", term, code, codeList));
                }
                if (values.count(value) > 1) {
                    errors.add(
                            new DictionaryConstraintViolation("Duplicate code list values", term, value, codeList));
                }
                // A value may equal its own term's code, but must not equal any other code
                if (codes.contains(value) && !code.equals(value)) {
                    errors.add(new DictionaryConstraintViolation("Non-disjoint code list code and value", term,
                            value, codeList));
                }
            }
        }
    }

    /**
     * Lookup structure over a dictionary: schemas by name, and fields, restriction lists and restrictions grouped
     * by type, each keyed by (schema name, field name). The index is populated once in the constructor.
     */
    public static class DictionaryIndex {

        final Map<String, FileSchema> schemata = newHashMap();
        final Table<String, String, Field> fields = HashBasedTable.create();
        final Table<String, String, List<Restriction>> restrictions = HashBasedTable.create();
        final Table<String, String, Multimap<RestrictionType, Restriction>> restrictionTypes = HashBasedTable
                .create();

        /**
         * Builds the index from all file schemas of the given dictionary.
         *
         * @param dictionary the dictionary to index
         */
        public DictionaryIndex(Dictionary dictionary) {
            index(dictionary);
        }

        private void index(Dictionary dictionary) {
            for (val schema : dictionary.getFiles()) {
                schemata.put(schema.getName(), schema);
                for (val field : schema.getFields()) {
                    fields.put(schema.getName(), field.getName(), field);
                    restrictions.put(schema.getName(), field.getName(), field.getRestrictions());
                    restrictionTypes.put(schema.getName(), field.getName(), indexByType(field.getRestrictions()));
                }
            }
        }

        /**
         * Groups restrictions by their type. Restrictions without a type are left out of the grouping because
         * {@code Multimaps.index} rejects {@code null} keys; they remain available through
         * {@link #getRestrictions(String, String)} so that the validator can still report them.
         */
        private Multimap<RestrictionType, Restriction> indexByType(Iterable<Restriction> restrictions) {
            List<Restriction> typed = new ArrayList<Restriction>();
            for (Restriction restriction : restrictions) {
                if (restriction.getType() != null) {
                    typed.add(restriction);
                }
            }

            return Multimaps.index(typed, new Function<Restriction, RestrictionType>() {

                @Override
                public RestrictionType apply(Restriction restriction) {
                    return restriction.getType();
                }

            });
        }

        /** Returns the schema with the given name, or {@code null} if none was indexed. */
        public FileSchema getSchema(String schemaName) {
            return schemata.get(schemaName);
        }

        /** Returns whether a schema with the given name was indexed. */
        public boolean hasSchema(String schemaName) {
            return schemata.containsKey(schemaName);
        }

        /** Returns the field of the given schema, or {@code null} if the schema or field is unknown. */
        public Field getField(String schemaName, String fieldName) {
            return fields.get(schemaName, fieldName);
        }

        /** Returns whether the given schema has a field with the given name. */
        public boolean hasField(String schemaName, String fieldName) {
            return fields.contains(schemaName, fieldName);
        }

        /** Returns the restrictions of the given field, or {@code null} if the schema or field is unknown. */
        public List<Restriction> getRestrictions(String schemaName, String fieldName) {
            return restrictions.get(schemaName, fieldName);
        }

        /** Returns whether a restriction list was indexed for the given field. */
        public boolean hasRestrictions(String schemaName, String fieldName) {
            return restrictions.contains(schemaName, fieldName);
        }

        /**
         * Returns the distinct restriction types of the given field; an empty set if the schema or field is
         * unknown.
         */
        public Set<RestrictionType> getRestrictionTypes(String schemaName, String fieldName) {
            val types = restrictionTypes.get(schemaName, fieldName);
            return types == null ? new HashSet<RestrictionType>() : types.keySet();
        }

        /**
         * Returns all restrictions of the given field grouped by type, or {@code null} if the schema or field is
         * unknown.
         * <p>
         * NOTE(review): the {@code type} argument is currently ignored and the complete grouping is returned; it
         * is kept only so that existing callers continue to compile.
         */
        public Multimap<RestrictionType, Restriction> getRestrictionType(String schemaName, String fieldName,
                String type) {
            return restrictionTypes.get(schemaName, fieldName);
        }

        /** Returns whether the given field has at least one restriction of the given type. */
        public boolean hasRestrictionType(String schemaName, String fieldName, RestrictionType type) {
            val types = restrictionTypes.get(schemaName, fieldName);
            return types != null && types.containsKey(type);
        }

    }

    /**
     * Lookup structure that groups code lists by name. Several code lists may share one name, which is why a
     * lookup yields a collection.
     */
    public static class CodeListIndex {

        /** Key extractor used to group code lists by their name. */
        private static final Function<CodeList, String> NAME_OF = new Function<CodeList, String>() {

            @Override
            public String apply(CodeList codeList) {
                return codeList.getName();
            }

        };

        private final Multimap<String, CodeList> codeLists;

        /**
         * Indexes the supplied code lists by name.
         *
         * @param codeLists the code lists to index
         */
        public CodeListIndex(Iterable<CodeList> codeLists) {
            this.codeLists = Multimaps.index(codeLists, NAME_OF);
        }

        /** Returns whether at least one code list with the given name was indexed. */
        public boolean has(String name) {
            return codeLists.containsKey(name);
        }

        /** Returns all code lists indexed under the given name; empty if there are none. */
        public Collection<CodeList> get(String name) {
            return codeLists.get(name);
        }

    }

    /**
     * Outcome of a validation run: the warnings and the errors that were collected.
     */
    @Value
    public static class DictionaryConstraintViolations {

        // Declaration order is significant: the validator constructs this type positionally as
        // (warnings, errors), presumably through the constructor Lombok derives from these fields.
        private final Set<DictionaryConstraintViolation> warnings;
        private final Set<DictionaryConstraintViolation> errors;

        /**
         * @return {@code true} if at least one error was recorded
         */
        public boolean hasErrors() {
            return !errors.isEmpty();
        }

        /**
         * @return {@code true} if at least one warning was recorded
         */
        public boolean hasWarnings() {
            return !warnings.isEmpty();
        }

    }

    /**
     * A single problem found while validating a dictionary: a human-readable description plus the objects that
     * give the problem its context (for example the schema, field or restriction involved).
     */
    @Value
    public static class DictionaryConstraintViolation {

        /** Short description of the violated constraint. */
        private final String description;

        /** Objects that locate the violation; may be empty when the description alone is sufficient. */
        private final Object[] context;

        /**
         * @param description short description of the violated constraint
         * @param context zero or more objects that locate the violation
         */
        public DictionaryConstraintViolation(String description, Object... context) {
            this.description = description;
            // NOTE(review): the caller's array is stored as-is, without a defensive copy
            this.context = context;
        }

    }

}