eionet.meta.imp.VocabularyRDFImportHandler.java Source code

Introduction

Here is the source code for eionet.meta.imp.VocabularyRDFImportHandler.java
Source

/*
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Data Dictionary
 *
 * The Initial Owner of the Original Code is European Environment
 * Agency. Portions created by TripleDev are Copyright
 * (C) European Environment Agency.  All Rights Reserved.
 *
 * Contributor(s):
 * TripleDev
 */

package eionet.meta.imp;

import eionet.meta.dao.domain.DataElement;
import eionet.meta.dao.domain.VocabularyConcept;
import eionet.meta.service.ServiceException;
import eionet.util.Pair;
import org.apache.commons.lang.StringUtils;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;

import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import eionet.util.Util;
import eionet.meta.exports.VocabularyOutputHelper;

/**
 * Implementation of OpenRDF's {@link RDFHandler} that will be used by implementations of
 * {@link eionet.meta.service.IRDFVocabularyImportService}. Contains callback methods for listening to the content coming from
 * {@link eionet.meta.service.IRDFVocabularyImportService} and loading them into vocabulary.
 *
 * @author enver
 */
// @Configurable
public class VocabularyRDFImportHandler extends VocabularyImportBaseHandler implements RDFHandler {

    /* static constants */
    /**
     * Rules for ignoring predicates. A predicate that may have to be ignored is put into this map with its URI string as the
     * key. The value is a Pair holding a Class and a String: the Class is checked against the type of the triple's object and
     * the String is a regular expression matched against the object's string value. For a triple T = (subject S, predicate P,
     * object O): if the map holds a rule R for P, and O is an instance of R.Class, and O.stringValue() matches R.String,
     * then T is ignored.
     */
    private static final Map<String, Pair<Class, String>> PREDICATE_IGNORANCE_RULES;

    /**
     * Name of the notation attribute of a concept.
     */
    private static final String NOTATION = "notation";
    /**
     * Name of the label attribute of a concept.
     */
    private static final String PREF_LABEL = "prefLabel";
    /**
     * Name of the definition attribute of a concept.
     */
    private static final String DEFINITION = "definition";
    /**
     * Name of the digest algorithm used to hash triples (the hashes are used to detect duplicated triples).
     */
    private static final String HASHING_ALGORITHM = "MD5";
    /**
     * Character encoding used when getting the bytes of a string to hash.
     */
    private static final String DEFAULT_ENCODING_OF_STRINGS = "UTF-8";
    /**
     * Namespace prefix used when building identifiers of concept attributes (prefix + ":" + attribute name).
     */
    private static final String SKOS_CONCEPT_ATTRIBUTE_NS = "skos";
    /**
     * Names of the SKOS attributes (notation, prefLabel, definition) that update concept fields instead of data elements.
     */
    private static final List<String> SKOS_CONCEPT_ATTRIBUTES;

    static {
        SKOS_CONCEPT_ATTRIBUTES = new ArrayList<String>();
        SKOS_CONCEPT_ATTRIBUTES.add("notation");
        SKOS_CONCEPT_ATTRIBUTES.add("prefLabel");
        SKOS_CONCEPT_ATTRIBUTES.add("definition");
        PREDICATE_IGNORANCE_RULES = new HashMap<String, Pair<Class, String>>();
        PREDICATE_IGNORANCE_RULES.put(VocabularyOutputHelper.LinkedDataNamespaces.SKOS_NS + "inScheme",
                new Pair<Class, String>(Object.class, "(.)*"));
        PREDICATE_IGNORANCE_RULES.put(VocabularyOutputHelper.LinkedDataNamespaces.RDF_NS + "type",
                new Pair<Class, String>(URI.class,
                        VocabularyOutputHelper.LinkedDataNamespaces.SKOS_NS + "Concept"));
    }

    /* member fields */
    /**
     * Namespace URIs bound to the vocabulary. Key is the namespace URI a predicate has to start with, value is the namespace
     * prefix used when building data element identifiers.
     */
    private Map<String, String> boundURIs = null;

    /**
     * Bound elements of the vocabulary. Key is a namespace prefix, value is the list of attribute identifiers bound in it.
     */
    protected Map<String, List<String>> boundElements = null;

    /**
     * Number of valid triples that are processed.
     */
    private int numberOfValidTriples = 0;
    /**
     * Total number of triples that are processed.
     */
    private int totalNumberOfTriples = 0;
    /**
     * Temporary list of the current concept's elements for an element identifier (not to query all the time).
     */
    private List<DataElement> elementsOfConcept = null;
    /**
     * Temporary list of the current concept's elements for an element identifier and language (not to query all the time).
     */
    private List<DataElement> elementsOfConceptByLang = null;
    /**
     * Previous successful triple's subject (concept) identifier.
     */
    private String prevConceptIdentifier = null;
    /**
     * Previous successful triple's predicate (data element) identifier.
     */
    private String prevDataElemIdentifier = null;
    /**
     * Previous successful triple's language.
     */
    private String prevLang = null;
    /**
     * Map to hold data element positions. NOTE(review): only initialized in this class, no other use is visible here.
     */
    private Map<String, Map<String, Integer>> attributePositions = null;
    /**
     * Temporary object to hold the last found concept, so that the concept lists are not iterated again for every triple.
     */
    private VocabularyConcept lastFoundConcept;
    /**
     * This map is used to detect if a predicate (prefLabel, definition or notation) has already been used to update a
     * concept. Key is the attribute identifier (prefix + ":" + name), value is the set of ids of the updated concepts.
     */
    private Map<String, Set<Integer>> conceptsUpdatedForAttributes = null;
    /**
     * Concepts updated per predicate. Key is the predicate URI, value is the set of ids of the concepts updated with it.
     */
    private Map<String, Set<Integer>> predicateUpdatesAtConcepts = null;
    /**
     * Boolean to create new data elements for predicates. When set, values that a concept already has for a seen predicate
     * are removed before the imported values are added.
     */
    private boolean createNewDataElementsForPredicates = false;
    /**
     * This set includes predicates which are not bound to vocabulary.
     */
    private Set<String> notBoundPredicates = null;
    /**
     * This map is used to get the data element identifier for a predicate URI.
     */
    private Map<String, String> identifierOfPredicate = null;
    /**
     * Working language: the first two characters of the working language given to the constructor.
     */
    private String workingLanguage = null;
    /**
     * Local instance namespace. Identifiers of elements in this namespace are used without a namespace prefix.
     */
    private final String ddNamespace;
    /**
     * Number of duplicated triples.
     */
    private int numberOfDuplicatedTriples = 0;
    /**
     * Hash codes of the statements seen so far, stored so that the same statement is not processed again.
     */
    private Set<BigInteger> seenStatementsHashCodes = null;
    /**
     * Message Digest instance used for triple hashing.
     */
    private MessageDigest messageDigestInstance = null;
    /**
     * This map stores the last seen language-tagged candidate for DEFINITION and LABEL. Key is concept id followed by the
     * data element identifier.
     */
    private Map<String, Literal> lastCandidateForConceptAttribute = null;

    /**
     * Constructor for RDFHandler to import rdf into vocabulary. Parameters are documented in the order of the signature.
     *
     * @param folderContextRoot
     *            base uri for vocabulary.
     * @param concepts
     *            concepts of vocabulary
     * @param boundElementsToIds
     *            bound elements ids.
     * @param boundElements
     *            bound elements to vocabulary.
     * @param boundURIs
     *            rdf namespaces for bound elements
     * @param createNewDataElementsForPredicates
     *            create new data elements for seen predicates
     * @param workingLanguage
     *            working language, only first two letters are used
     * @param ddNamespace
     *            dd instance namespace
     * @throws ServiceException
     *             when digest algorithm cannot be found (the original exception is attached as the cause)
     */
    public VocabularyRDFImportHandler(String folderContextRoot, List<VocabularyConcept> concepts,
            Map<String, Integer> boundElementsToIds, Map<String, List<String>> boundElements,
            Map<String, String> boundURIs, boolean createNewDataElementsForPredicates, String workingLanguage,
            String ddNamespace) throws ServiceException {
        super(folderContextRoot, concepts, boundElementsToIds);
        this.boundElements = boundElements;
        this.createNewDataElementsForPredicates = createNewDataElementsForPredicates;
        this.boundURIs = boundURIs;
        this.attributePositions = new HashMap<String, Map<String, Integer>>();
        this.predicateUpdatesAtConcepts = new HashMap<String, Set<Integer>>();
        this.notBoundPredicates = new HashSet<String>();
        this.identifierOfPredicate = new HashMap<String, String>();
        // one (initially empty) set of updated concept ids per concept attribute
        this.conceptsUpdatedForAttributes = new HashMap<String, Set<Integer>>();
        this.conceptsUpdatedForAttributes.put(SKOS_CONCEPT_ATTRIBUTE_NS + ":" + PREF_LABEL, new HashSet<Integer>());
        this.conceptsUpdatedForAttributes.put(SKOS_CONCEPT_ATTRIBUTE_NS + ":" + DEFINITION, new HashSet<Integer>());
        this.conceptsUpdatedForAttributes.put(SKOS_CONCEPT_ATTRIBUTE_NS + ":" + NOTATION, new HashSet<Integer>());
        this.lastCandidateForConceptAttribute = new HashMap<String, Literal>();
        // get first two letters of working language since, it can be like en-US
        this.workingLanguage = StringUtils.substring(workingLanguage, 0, 2);
        this.ddNamespace = ddNamespace;
        this.seenStatementsHashCodes = new HashSet<BigInteger>();
        try {
            this.messageDigestInstance = MessageDigest.getInstance(HASHING_ALGORITHM);
        } catch (NoSuchAlgorithmException e) {
            // same message as before, but keep the cause so that the original stack trace is not lost
            throw new ServiceException(e.getMessage(), e);
        }
    } // end of constructor

    /**
     * Callback for the start of the RDF data. Intentionally does nothing.
     *
     * @throws RDFHandlerException
     *             declared by the interface, never thrown here
     */
    @Override
    public void startRDF() throws RDFHandlerException {
    } // end of method startRDF

    /**
     * Callback for a comment in the RDF data. Comments are ignored, so this method intentionally does nothing.
     *
     * @param comment
     *            the comment (not used)
     * @throws RDFHandlerException
     *             declared by the interface, never thrown here
     */
    @Override
    public void handleComment(String comment) throws RDFHandlerException {
    } // end of method handleComment

    /**
     * Callback for a namespace declaration in the RDF data. Declarations are ignored, so this method intentionally does
     * nothing.
     *
     * @param prefix
     *            the namespace prefix (not used)
     * @param uri
     *            the namespace uri (not used)
     * @throws RDFHandlerException
     *             declared by the interface, never thrown here
     */
    @Override
    public void handleNamespace(String prefix, String uri) throws RDFHandlerException {
    } // end of method handleNamespace

    /**
     * Callback for a single statement (triple) of the RDF data. A valid statement either updates a field of its concept
     * (notation, label, definition) or adds a data element value to the concept, or both.
     * <p>
     * A statement is skipped, and not counted as valid, when:
     * <ul>
     * <li>its subject is not a URI, or its object is neither a URI nor a literal,</li>
     * <li>its subject does not start with the base uri of the vocabulary,</li>
     * <li>it duplicates a statement seen before (counted in the number of duplicated triples),</li>
     * <li>the rest of the subject after the base uri contains a "/" or is not a valid identifier,</li>
     * <li>it matches a predicate ignorance rule,</li>
     * <li>its predicate is neither a concept attribute nor a bound element (the predicate is then remembered as not
     * bound),</li>
     * <li>the concept can neither be found nor created,</li>
     * <li>its value is empty, or the concept already has the value.</li>
     * </ul>
     *
     * @param st
     *            statement to be processed
     * @throws RDFHandlerException
     *             when the statement cannot be converted to bytes for hashing
     */
    @Override
    public void handleStatement(Statement st) throws RDFHandlerException {
        this.totalNumberOfTriples++;
        Resource subject = st.getSubject();
        URI predicate = st.getPredicate();
        Value object = st.getObject();

        // NOT imported, subject is not a URI
        if (!(subject instanceof URI)) {
            return;
        }

        // NOT imported, object should be a resource (URI) or a literal (value)
        if (!(object instanceof URI) && !(object instanceof Literal)) {
            return;
        }

        // NOT imported, subject does not have the base URI
        String conceptUri = subject.stringValue();
        if (!StringUtils.startsWith(conceptUri, this.folderContextRoot)) {
            return;
        }

        // NOT imported, duplicates a previous triple (add returns false when the hash is already in the set)
        if (!this.seenStatementsHashCodes.add(digestOfStatement(st))) {
            this.numberOfDuplicatedTriples++;
            return;
        }

        // if it does not have a conceptIdentifier then it may be an attribute for vocabulary or a wrong record, so just
        // ignore it. Only the leading base URI is stripped; a base URI repeated later in the subject is kept as is.
        String conceptIdentifier = StringUtils.removeStart(conceptUri, this.folderContextRoot);
        if (StringUtils.contains(conceptIdentifier, "/") || !Util.isValidIdentifier(conceptIdentifier)) {
            return;
        }

        String predicateUri = predicate.stringValue();

        Pair<Class, String> ignoranceRule = PREDICATE_IGNORANCE_RULES.get(predicateUri);
        if (ignoranceRule != null && ignoranceRule.getLeft().isInstance(object)
                && object.stringValue().matches(ignoranceRule.getRight())) {
            // ignore value
            return;
        }

        String attributeIdentifier = null;
        String predicateNS = null;

        boolean candidateForConceptAttribute = false;
        String skosNamespace = VocabularyOutputHelper.LinkedDataNamespaces.SKOS_NS;
        if (StringUtils.startsWith(predicateUri, skosNamespace)) {
            attributeIdentifier = StringUtils.removeStart(predicateUri, skosNamespace);
            candidateForConceptAttribute = SKOS_CONCEPT_ATTRIBUTES.contains(attributeIdentifier);
            if (candidateForConceptAttribute) {
                predicateNS = SKOS_CONCEPT_ATTRIBUTE_NS;
            }
        }

        // NOT imported, object is not a Literal for concept attribute
        if (candidateForConceptAttribute && !(object instanceof Literal)) {
            return;
        }

        if (!candidateForConceptAttribute) {
            // NOTE(review): the first matching namespace wins; if one bound URI is a prefix of another the result depends
            // on the iteration order of the map - confirm that bound URIs cannot overlap
            for (Map.Entry<String, String> boundUri : this.boundURIs.entrySet()) {
                String namespaceUri = boundUri.getKey();
                if (StringUtils.startsWith(predicateUri, namespaceUri)) {
                    attributeIdentifier = StringUtils.removeStart(predicateUri, namespaceUri);
                    predicateNS = boundUri.getValue();
                    // a namespace without any registered element is treated like an element that is not bound
                    List<String> elementsBoundInNamespace = this.boundElements.get(predicateNS);
                    if (elementsBoundInNamespace == null || !elementsBoundInNamespace.contains(attributeIdentifier)) {
                        predicateNS = null;
                    }
                    break;
                }
            }
        }

        // NOT imported, predicate is not a bound URI nor a concept attribute
        if (StringUtils.isEmpty(predicateNS)) {
            this.notBoundPredicates.add(predicateUri);
            return;
        }

        // if execution comes here so we have a valid triple to import
        // first find the concept; caches are only valid while consecutive triples belong to the same concept
        if (!StringUtils.equals(conceptIdentifier, this.prevConceptIdentifier)) {
            this.prevDataElemIdentifier = null;
            this.prevLang = null;
            this.lastFoundConcept = null;
        }
        this.prevConceptIdentifier = conceptIdentifier;

        if (this.lastFoundConcept == null) {
            Pair<VocabularyConcept, Boolean> foundConceptWithFlag = findOrCreateConcept(conceptIdentifier);
            // if vocabulary concept couldnt find or couldnt be created
            if (foundConceptWithFlag == null) {
                return;
            }

            this.lastFoundConcept = foundConceptWithFlag.getLeft();
            if (!foundConceptWithFlag.getRight()) {
                // vocabulary concept found or created, add it to list
                this.toBeUpdatedConcepts.add(this.lastFoundConcept);
            }
        }

        // elements of the local namespace are identified without a namespace prefix
        String dataElemIdentifier = predicateNS + ":" + attributeIdentifier;
        if (StringUtils.equals(this.ddNamespace, predicateNS)) {
            dataElemIdentifier = attributeIdentifier;
        }

        if (candidateForConceptAttribute) {
            // object is a Literal here, other objects of concept attributes were rejected above
            candidateForConceptAttribute = applyConceptAttribute(attributeIdentifier, dataElemIdentifier, (Literal) object);
        }

        if (!candidateForConceptAttribute && !addDataElementValue(predicateUri, dataElemIdentifier, object)) {
            return;
        }
        this.numberOfValidTriples++;
    } // end of method handleStatement

    /**
     * Calculates the hash of a statement from its string representation.
     *
     * @param st
     *            statement to be hashed
     * @return digest of the statement as a non-negative number
     * @throws RDFHandlerException
     *             when the encoding used to get the bytes of the statement is not supported
     */
    private BigInteger digestOfStatement(Statement st) throws RDFHandlerException {
        this.messageDigestInstance.reset();
        try {
            byte[] digested = this.messageDigestInstance.digest(st.toString().getBytes(DEFAULT_ENCODING_OF_STRINGS));
            return new BigInteger(1, digested);
        } catch (UnsupportedEncodingException e) {
            throw new RDFHandlerException(e);
        }
    } // end of method digestOfStatement

    /**
     * Returns the first two characters of the language of a literal.
     *
     * @param literal
     *            literal to get the language of
     * @return at most two characters of the language, null when the literal has no language
     */
    private String twoLetterLanguage(Literal literal) {
        return StringUtils.substring(literal.getLanguage(), 0, 2);
    } // end of method twoLetterLanguage

    /**
     * Writes a value to the field of the last found concept that belongs to the given concept attribute.
     *
     * @param attributeIdentifier
     *            name of the concept attribute (notation, definition or prefLabel), other names are ignored
     * @param value
     *            value to be set
     */
    private void setConceptField(String attributeIdentifier, String value) {
        if (StringUtils.equals(attributeIdentifier, NOTATION)) {
            this.lastFoundConcept.setNotation(value);
        } else if (StringUtils.equals(attributeIdentifier, DEFINITION)) {
            this.lastFoundConcept.setDefinition(value);
        } else if (StringUtils.equals(attributeIdentifier, PREF_LABEL)) {
            this.lastFoundConcept.setLabel(value);
        }
    } // end of method setConceptField

    /**
     * Uses a literal of a concept attribute (notation, definition or prefLabel) to update the last found concept.
     * <p>
     * The first literal seen for a concept and attribute always updates the concept. If that literal has a language, it is
     * remembered as a candidate that a later literal can replace: a literal without a language always replaces it, a literal
     * in the working language replaces a candidate of another language.
     *
     * @param attributeIdentifier
     *            name of the concept attribute
     * @param dataElemIdentifier
     *            identifier of the data element of the predicate
     * @param literal
     *            object of the statement
     * @return true when the literal was used as a concept attribute only, false when it has to be added to the concept as a
     *         data element value (as well)
     */
    private boolean applyConceptAttribute(String attributeIdentifier, String dataElemIdentifier, Literal literal) {
        String candidateKey = this.lastFoundConcept.getId() + dataElemIdentifier;

        // add returns true only for the first value of this attribute for the concept
        if (this.conceptsUpdatedForAttributes.get(dataElemIdentifier).add(this.lastFoundConcept.getId())) {
            setConceptField(attributeIdentifier, StringUtils.trimToNull(literal.stringValue()));
            if (StringUtils.equals(attributeIdentifier, NOTATION)) {
                return true;
            }
            if (StringUtils.isNotBlank(twoLetterLanguage(literal))) {
                // a value with a language can be replaced later and is also kept as a data element value
                this.lastCandidateForConceptAttribute.put(candidateKey, literal);
                return false;
            }
            return true;
        }

        if (!this.lastCandidateForConceptAttribute.containsKey(candidateKey)) {
            // concept field has its final value, the literal can only be a data element value
            return false;
        }

        // check if more prior value received
        Literal previousCandidate = this.lastCandidateForConceptAttribute.get(candidateKey);
        String literalLanguage = twoLetterLanguage(literal);
        if (StringUtils.isEmpty(literalLanguage)) {
            // a value without language is final, no candidate is needed any more
            this.lastCandidateForConceptAttribute.remove(candidateKey);
            setConceptField(attributeIdentifier, StringUtils.trimToNull(literal.stringValue()));
            return true;
        }
        if (StringUtils.equals(literalLanguage, this.workingLanguage)
                && !StringUtils.equals(twoLetterLanguage(previousCandidate), this.workingLanguage)) {
            // working language has priority over the language of the previous candidate
            this.lastCandidateForConceptAttribute.put(candidateKey, literal);
            setConceptField(attributeIdentifier, StringUtils.trimToNull(literal.stringValue()));
            return false;
        }
        // previous candidate stays as it is
        return false;
    } // end of method applyConceptAttribute

    /**
     * Adds the object of a statement to the last found concept as a value of a data element.
     * <p>
     * An empty value is rejected before the cached element list is looked up or created, so that it neither leaves an empty
     * list at the concept nor replaces the list cached for the previous predicate.
     *
     * @param predicateUri
     *            uri of the predicate of the statement
     * @param dataElemIdentifier
     *            identifier of the data element of the predicate
     * @param object
     *            object of the statement, a URI or a Literal
     * @return true when a new value is added to the concept, false when the predicate is not bound, the value is empty or
     *         the concept already has the value
     */
    private boolean addDataElementValue(String predicateUri, String dataElemIdentifier, Value object) {
        if (!this.boundElementsIds.containsKey(dataElemIdentifier)) {
            this.notBoundPredicates.add(predicateUri);
            return false;
        }

        Set<Integer> conceptIdsUpdatedWithPredicate = this.predicateUpdatesAtConcepts.get(predicateUri);
        if (conceptIdsUpdatedWithPredicate == null) {
            conceptIdsUpdatedWithPredicate = new HashSet<Integer>();
            this.predicateUpdatesAtConcepts.put(predicateUri, conceptIdsUpdatedWithPredicate);
        }
        if (!this.identifierOfPredicate.containsKey(predicateUri)) {
            this.identifierOfPredicate.put(predicateUri, dataElemIdentifier);
        }

        String elementValue = object.stringValue();
        if (StringUtils.isEmpty(elementValue)) {
            // value is empty, no need to continue
            return false;
        }

        // find the data element
        if (!StringUtils.equals(dataElemIdentifier, this.prevDataElemIdentifier)) {
            this.elementsOfConcept = getDataElementValuesByName(dataElemIdentifier,
                    this.lastFoundConcept.getElementAttributes());
            if (this.createNewDataElementsForPredicates
                    && !conceptIdsUpdatedWithPredicate.contains(this.lastFoundConcept.getId())) {
                // first value of the predicate for this concept, existing values are replaced
                if (this.elementsOfConcept != null) {
                    this.lastFoundConcept.getElementAttributes().remove(this.elementsOfConcept);
                }
                this.elementsOfConcept = null;
            }

            if (this.elementsOfConcept == null) {
                this.elementsOfConcept = new ArrayList<DataElement>();
                this.lastFoundConcept.getElementAttributes().add(this.elementsOfConcept);
            }
        }

        String elemLang = null;
        VocabularyConcept foundRelatedConcept = null;
        // if object is a resource (i.e. URI), it can be a related concept
        if (object instanceof URI) {
            foundRelatedConcept = findRelatedConcept(elementValue);
        } else if (object instanceof Literal) {
            // it is literal
            elemLang = twoLetterLanguage((Literal) object);
        }

        if (!StringUtils.equals(dataElemIdentifier, this.prevDataElemIdentifier)
                || !StringUtils.equals(elemLang, this.prevLang)) {
            this.elementsOfConceptByLang = getDataElementValuesByNameAndLang(dataElemIdentifier, elemLang,
                    this.lastFoundConcept.getElementAttributes());
        }
        this.prevLang = elemLang;
        this.prevDataElemIdentifier = dataElemIdentifier;

        // check for pre-existence of the VCE by attribute value or related concept id
        Integer relatedId = null;
        if (foundRelatedConcept != null) {
            relatedId = foundRelatedConcept.getId();
        }
        for (DataElement elemByLang : this.elementsOfConceptByLang) {
            String elementValueByLang = elemByLang.getAttributeValue();
            if (StringUtils.equals(elementValue, elementValueByLang)) {
                // vocabulary concept element already in database, no need to continue
                return false;
            }
            if (relatedId != null) {
                Integer relatedConceptId = elemByLang.getRelatedConceptId();
                if (relatedConceptId != null && relatedConceptId.intValue() == relatedId.intValue()) {
                    // vocabulary concept element already in database, no need to continue
                    return false;
                }
            }
        }

        // create VCE
        DataElement elem = new DataElement();
        this.elementsOfConcept.add(elem);
        elem.setAttributeLanguage(elemLang);
        elem.setIdentifier(dataElemIdentifier);
        elem.setId(this.boundElementsIds.get(dataElemIdentifier));
        // check if there is a found related concept
        if (foundRelatedConcept != null) {
            elem.setRelatedConceptIdentifier(foundRelatedConcept.getIdentifier());
            int id = foundRelatedConcept.getId();
            elem.setRelatedConceptId(id);
            elem.setAttributeValue(null);
            if (id < 0) {
                // NOTE(review): a negative id presumably marks a concept that is not created yet - confirm in base class
                addToElementsReferringNotCreatedConcepts(id, elem);
            }
        } else {
            elem.setAttributeValue(elementValue);
            elem.setRelatedConceptId(null);
        }

        conceptIdsUpdatedWithPredicate.add(this.lastFoundConcept.getId());
        return true;
    } // end of method addDataElementValue

    /**
     * Callback for the end of the RDF data. When new data elements are created for predicates, values of the seen
     * predicates are removed from the concepts that did not receive a value for them. Afterwards the post-processing steps
     * of the base handler are run and a summary of the import is appended to the log messages.
     *
     * @throws RDFHandlerException
     *             declared by the interface, not thrown by the code of this method itself
     */
    @Override
    public void endRDF() throws RDFHandlerException {
        if (this.createNewDataElementsForPredicates) {
            // if purge per predicate is selected and rdf does not contain any for some concepts. Then those untouched
            // concepts should be updated to remove these predicates if they have
            // 1. first do it for toBeUpdateConcepts
            for (VocabularyConcept updatedConcept : this.toBeUpdatedConcepts) {
                for (Map.Entry<String, Set<Integer>> updatesOfPredicate : this.predicateUpdatesAtConcepts.entrySet()) {
                    if (updatesOfPredicate.getValue().contains(updatedConcept.getId())) {
                        // concept received values for this predicate, they are kept
                        continue;
                    }
                    String elementIdentifier = this.identifierOfPredicate.get(updatesOfPredicate.getKey());
                    List<DataElement> staleElements =
                            getDataElementValuesByName(elementIdentifier, updatedConcept.getElementAttributes());
                    if (staleElements != null && !staleElements.isEmpty()) {
                        updatedConcept.getElementAttributes().remove(staleElements);
                    }
                }
            }
            // 2. do it for untouched concepts
            for (VocabularyConcept untouchedConcept : this.concepts) {
                boolean purged = false;
                for (String predicateUri : this.predicateUpdatesAtConcepts.keySet()) {
                    String elementIdentifier = this.identifierOfPredicate.get(predicateUri);
                    List<DataElement> staleElements =
                            getDataElementValuesByName(elementIdentifier, untouchedConcept.getElementAttributes());
                    if (staleElements != null && !staleElements.isEmpty()) {
                        untouchedConcept.getElementAttributes().remove(staleElements);
                        purged = true;
                    }
                }
                if (purged) {
                    this.toBeUpdatedConcepts.add(untouchedConcept);
                }
            }
        }

        // check for null label containing concepts
        List<String> nullLabelMessages = processNewlyCreatedConceptsForNullCheck();
        // process unseen concepts for related elements
        processUnseenConceptsForRelatedElements();

        // summary of the import
        String tripleCounts = "Valid (" + this.numberOfValidTriples + ") / Total (" + this.totalNumberOfTriples + ")";
        this.logMessages.add(tripleCounts);
        this.logMessages.add("Number of predicates seen: " + this.predicateUpdatesAtConcepts.size());
        this.logMessages.add("Number of duplicate triples: " + this.numberOfDuplicatedTriples);
        this.logMessages.add("Number of concepts seen per predicate: ");
        for (Map.Entry<String, Set<Integer>> updatesOfPredicate : this.predicateUpdatesAtConcepts.entrySet()) {
            String predicateUri = updatesOfPredicate.getKey();
            int numberOfConcepts = updatesOfPredicate.getValue().size();
            this.logMessages.add("--> " + predicateUri + " (" + this.identifierOfPredicate.get(predicateUri) + "): "
                    + numberOfConcepts);
        }
        int numberOfNotBoundPredicates = this.notBoundPredicates.size();
        this.logMessages.add("Not imported predicates (" + numberOfNotBoundPredicates
                + ") which are not bound to vocabulary: ");
        for (String notBoundPredicate : this.notBoundPredicates) {
            this.logMessages.add("--> " + notBoundPredicate);
        }
        if (nullLabelMessages != null) {
            this.logMessages.addAll(nullLabelMessages);
        }
        this.logMessages.add("Number of updated concepts: " + this.toBeUpdatedConcepts.size());
    } // end of method endRDF

} // end of class VocabularyRDFImportHandler