org.dswarm.graph.batch.Neo4jProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.dswarm.graph.batch.Neo4jProcessor.java

Source

/**
 * This file is part of d:swarm graph extension.
 *
 * d:swarm graph extension is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * d:swarm graph extension is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with d:swarm graph extension.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.dswarm.graph.batch;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.dswarm.graph.DMPGraphException;
import org.dswarm.graph.GraphIndexStatics;
import org.dswarm.graph.NodeType;
import org.dswarm.graph.hash.HashUtils;
import org.dswarm.graph.model.GraphStatics;
import org.neo4j.graphdb.DynamicLabel;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.BatchInserter;
import org.neo4j.unsafe.batchinsert.BatchInserterIndex;
import org.neo4j.unsafe.batchinsert.BatchInserterIndexProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.carrotsearch.hppc.LongLongOpenHashMap;
import com.carrotsearch.hppc.LongObjectOpenHashMap;
import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import com.github.emboss.siphash.SipHash;
import com.github.emboss.siphash.SipKey;
import com.google.common.base.Charsets;
import com.google.common.base.Optional;

/**
 * Base class for processors that write graph data into Neo4j through a {@link BatchInserter}.
 * <p>
 * Entries for the resources, resources-with-data-model, resource types and statement hashes indices are first collected
 * in in-memory HPPC maps and are only written to the Lucene batch inserter indices by {@link #flushIndices()}. Entries
 * of the values index are written to the index directly; blank nodes are only kept in memory.
 *
 * @author tgaengler
 */
public abstract class Neo4jProcessor {

    private static final Logger LOG = LoggerFactory.getLogger(Neo4jProcessor.class);

    /** Number of written entries after which a progress message is logged while an index is pumped. */
    private static final int PROGRESS_LOG_ENTRY_INTERVAL = 1000000;

    /** Time span (in milliseconds) after which a progress message is logged while an index is pumped. */
    private static final long PROGRESS_LOG_TIME_INTERVAL_MILLIS = 60000L;

    /** Label counter; it is never modified by this class itself (presumably maintained by subclasses). */
    protected int addedLabels = 0;

    /** Fixed SipHash key (bytes 0x00..0x0f) that is used for all statement hashes. */
    private static final SipKey SPEC_KEY = new SipKey(HashUtils.bytesOf(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
            0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f));

    protected final BatchInserter inserter;

    /** Provider of all Lucene indices of this processor; created lazily by the first index request. */
    private BatchInserterIndexProvider indexProvider;

    // persistent indices; apart from 'values' they are created lazily by initIndices()
    private BatchInserterIndex resources;
    private BatchInserterIndex resourcesWDataModel;
    private BatchInserterIndex resourceTypes;

    // in-memory buffers (key -> node id) that are pumped into the persistent indices on flush
    protected final ObjectLongOpenHashMap<String> tempResourcesIndex;
    protected final ObjectLongOpenHashMap<String> tempResourcesWDataModelIndex;
    protected final ObjectLongOpenHashMap<String> tempResourceTypes;

    private BatchInserterIndex values;

    /** Blank node id -> node id; only kept in memory, see {@link #clearMaps()}. */
    protected final ObjectLongOpenHashMap<String> bnodes;

    private BatchInserterIndex statementHashes;

    /** In-memory buffer (statement hash -> id) that is pumped into the statement hashes index on flush. */
    protected final LongLongOpenHashMap tempStatementHashes;

    /** Node id -> resource URI cache, see {@link #determineResourceUri(long, Optional, Optional, Optional)}. */
    protected final LongObjectOpenHashMap<String> nodeResourceMap;

    /**
     * Creates a new processor on top of the given batch inserter and initialises the values index.
     *
     * @param inserter the batch inserter all nodes, labels and indices are written with
     * @throws DMPGraphException if the values index couldn't be created
     */
    public Neo4jProcessor(final BatchInserter inserter) throws DMPGraphException {

        this.inserter = inserter;

        Neo4jProcessor.LOG.debug("start writing");

        bnodes = new ObjectLongOpenHashMap<>();
        nodeResourceMap = new LongObjectOpenHashMap<>();

        tempResourcesIndex = new ObjectLongOpenHashMap<>();
        tempResourcesWDataModelIndex = new ObjectLongOpenHashMap<>();
        tempResourceTypes = new ObjectLongOpenHashMap<>();
        tempStatementHashes = new LongLongOpenHashMap();

        // TODO: init all indices (via initIndices()) when the batch inserter should work on a pre-filled database;
        // otherwise the existing indices are not utilised in the first run, because look-ups skip indices that are
        // not initialised yet
        initValueIndex();
    }

    /**
     * Writes all buffered index entries into their persistent indices, flushes those indices and clears the buffers.
     * The persistent indices need to be initialised (see {@link #initIndices()}) before this method is called.
     */
    protected void pumpNFlushNClearIndices() {

        Neo4jProcessor.LOG.debug("start pumping indices");

        copyNFlushNClearIndex(tempResourcesIndex, resources, GraphStatics.URI,
                GraphIndexStatics.RESOURCES_INDEX_NAME);
        copyNFlushNClearIndex(tempResourcesWDataModelIndex, resourcesWDataModel, GraphStatics.URI_W_DATA_MODEL,
                GraphIndexStatics.RESOURCES_W_DATA_MODEL_INDEX_NAME);
        copyNFlushNClearIndex(tempResourceTypes, resourceTypes, GraphStatics.URI,
                GraphIndexStatics.RESOURCE_TYPES_INDEX_NAME);
        copyNFlushNClearLongIndex(tempStatementHashes, statementHashes, GraphStatics.HASH,
                GraphIndexStatics.STATEMENT_HASHES_INDEX_NAME);

        Neo4jProcessor.LOG.debug("finished pumping indices");
    }

    /**
     * Copies all entries of a String-keyed buffer into the given persistent index, then flushes the index and clears
     * the buffer.
     *
     * @param tempIndex     the in-memory buffer (key -> id)
     * @param neo4jIndex    the persistent target index
     * @param indexProperty the property the keys are indexed under
     * @param indexName     the index name (only used for logging)
     */
    private void copyNFlushNClearIndex(final ObjectLongOpenHashMap<String> tempIndex,
            final BatchInserterIndex neo4jIndex, final String indexProperty, final String indexName) {

        Neo4jProcessor.LOG.debug("start pumping '{}' index of size '{}'", indexName, tempIndex.size());

        // iterate over the backing arrays of the HPPC map; only slots that are marked as allocated hold an entry
        final Object[] keys = tempIndex.keys;
        final long[] ids = tempIndex.values;
        final boolean[] states = tempIndex.allocated;

        logBufferSizes(keys.length, ids.length, states.length);

        final PumpProgress progress = new PumpProgress();

        for (int i = 0; i < states.length; i++) {

            if (!states[i]) {

                continue;
            }

            // @tgaengler: I can't remember why I'm utilising an char array here ...
            // NOTE(review): confirm how the Lucene batch index treats array values (it may index every element
            // separately); left untouched, because it determines the content of the persisted index
            neo4jIndex.add(ids[i], MapUtil.map(indexProperty, keys[i].toString().toCharArray()));

            progress.entryWritten();
        }

        Neo4jProcessor.LOG.debug("finished pumping '{}' index; wrote '{}' entries", indexName,
                progress.getWritten());

        flushNClear(neo4jIndex, tempIndex);
    }

    /**
     * Copies all entries of a long-keyed buffer into the given persistent index, then flushes the index and clears
     * the buffer.
     *
     * @param tempIndex     the in-memory buffer (key -> id)
     * @param neo4jIndex    the persistent target index
     * @param indexProperty the property the keys are indexed under
     * @param indexName     the index name (only used for logging)
     */
    private void copyNFlushNClearLongIndex(final LongLongOpenHashMap tempIndex, final BatchInserterIndex neo4jIndex,
            final String indexProperty, final String indexName) {

        Neo4jProcessor.LOG.debug("start pumping '{}' index of size '{}'", indexName, tempIndex.size());

        // iterate over the backing arrays of the HPPC map; only slots that are marked as allocated hold an entry
        final long[] keys = tempIndex.keys;
        final long[] ids = tempIndex.values;
        final boolean[] states = tempIndex.allocated;

        logBufferSizes(keys.length, ids.length, states.length);

        final PumpProgress progress = new PumpProgress();

        for (int i = 0; i < states.length; i++) {

            if (!states[i]) {

                continue;
            }

            neo4jIndex.add(ids[i], MapUtil.map(indexProperty, keys[i]));

            progress.entryWritten();
        }

        Neo4jProcessor.LOG.debug("finished pumping index '{}' index; wrote '{}' entries", indexName,
                progress.getWritten());

        flushNClear(neo4jIndex, tempIndex);
    }

    /** Logs the sizes of the backing arrays of a buffer (debug level only). */
    private static void logBufferSizes(final int keysSize, final int valuesSize, final int statesSize) {

        if (Neo4jProcessor.LOG.isDebugEnabled()) {

            Neo4jProcessor.LOG.debug("keys size = '" + keysSize + "' :: values size = '" + valuesSize
                    + "' :: states size = '" + statesSize + "'");
        }
    }

    /** Flushes the persistent index and clears the String-keyed buffer that was pumped into it. */
    private static void flushNClear(final BatchInserterIndex neo4jIndex,
            final ObjectLongOpenHashMap<String> tempIndex) {

        Neo4jProcessor.LOG.debug("start flushing and clearing index");

        neo4jIndex.flush();
        tempIndex.clear();

        Neo4jProcessor.LOG.debug("finished flushing and clearing index");
    }

    /** Flushes the persistent index and clears the long-keyed buffer that was pumped into it. */
    private static void flushNClear(final BatchInserterIndex neo4jIndex, final LongLongOpenHashMap tempIndex) {

        Neo4jProcessor.LOG.debug("start flushing and clearing index");

        neo4jIndex.flush();
        tempIndex.clear();

        Neo4jProcessor.LOG.debug("finished flushing and clearing index");
    }

    /**
     * Creates (or loads) the values index.
     *
     * @throws DMPGraphException if the index couldn't be created; the original failure is attached as cause
     */
    protected void initValueIndex() throws DMPGraphException {

        try {

            values = getOrCreateIndex(GraphIndexStatics.VALUES_INDEX_NAME, GraphStatics.VALUE, true, 1);
        } catch (final Exception e) {

            throw indexLoadingFailure(e);
        }
    }

    /**
     * Creates (or loads) the resources, resources-with-data-model, resource types and statement hashes indices.
     *
     * @throws DMPGraphException if an index couldn't be created; the original failure is attached as cause
     */
    protected void initIndices() throws DMPGraphException {

        try {

            resources = getOrCreateIndex(GraphIndexStatics.RESOURCES_INDEX_NAME, GraphStatics.URI, true, 1);
            resourcesWDataModel = getOrCreateIndex(GraphIndexStatics.RESOURCES_W_DATA_MODEL_INDEX_NAME,
                    GraphStatics.URI_W_DATA_MODEL, true, 1);
            resourceTypes = getOrCreateIndex(GraphIndexStatics.RESOURCE_TYPES_INDEX_NAME, GraphStatics.URI, true,
                    1);
            statementHashes = getOrCreateIndex(GraphIndexStatics.STATEMENT_HASHES_INDEX_NAME, GraphStatics.HASH,
                    false, 1000000);
        } catch (final Exception e) {

            throw indexLoadingFailure(e);
        }
    }

    /**
     * Logs an index initialisation failure and wraps it into a {@link DMPGraphException}.
     *
     * @param cause the original failure
     * @return the exception to throw; it carries the original failure as cause
     */
    private static DMPGraphException indexLoadingFailure(final Exception cause) {

        final String message = "couldn't load indices successfully";

        Neo4jProcessor.LOG.error(message, cause);
        Neo4jProcessor.LOG.debug("couldn't finish writing successfully");

        final DMPGraphException exception = new DMPGraphException(message);

        // keep the root cause available for callers (only the message constructor is known to exist here)
        exception.initCause(cause);

        return exception;
    }

    public BatchInserter getBatchInserter() {

        return inserter;
    }

    public void addToResourcesIndex(final String key, final long nodeId) {

        tempResourcesIndex.put(key, nodeId);
    }

    public Optional<Long> getNodeIdFromResourcesIndex(final String key) {

        return getIdFromIndex(key, tempResourcesIndex, resources, GraphStatics.URI);
    }

    public void addToResourcesWDataModelIndex(final String key, final long nodeId) {

        tempResourcesWDataModelIndex.put(key, nodeId);
    }

    public Optional<Long> getNodeIdFromResourcesWDataModelIndex(final String key) {

        return getIdFromIndex(key, tempResourcesWDataModelIndex, resourcesWDataModel,
                GraphStatics.URI_W_DATA_MODEL);
    }

    public void addToBNodesIndex(final String key, final long nodeId) {

        bnodes.put(key, nodeId);
    }

    /**
     * Looks up a blank node in the in-memory blank node map.
     *
     * @param key the blank node identifier; may be {@code null}
     * @return the node id, or absent if the key is {@code null} or unknown
     */
    public Optional<Long> getNodeIdFromBNodesIndex(final String key) {

        if (key == null) {

            return Optional.absent();
        }

        // HPPC idiom: containsKey() remembers the slot that lget() reads afterwards
        if (bnodes.containsKey(key)) {

            return Optional.of(bnodes.lget());
        }

        return Optional.absent();
    }

    public void addToResourceTypesIndex(final String key, final long nodeId) {

        tempResourceTypes.put(key, nodeId);
    }

    public Optional<Long> getNodeIdFromResourceTypesIndex(final String key) {

        return getIdFromIndex(key, tempResourceTypes, resourceTypes, GraphStatics.URI);
    }

    /** Adds the entry to the persistent values index directly (this index has no in-memory buffer). */
    public void addToValueIndex(final String key, final long nodeId) {

        values.add(nodeId, MapUtil.map(GraphStatics.VALUE, key));
    }

    public void addToStatementIndex(final long key, final long nodeId) {

        tempStatementHashes.put(key, nodeId);
    }

    /**
     * Writes all buffered index entries to the persistent indices. The persistent indices are created first, if
     * that didn't happen yet.
     *
     * @throws DMPGraphException if the persistent indices couldn't be created
     */
    public void flushIndices() throws DMPGraphException {

        Neo4jProcessor.LOG.debug("start flushing indices");

        if (resources == null) {

            initIndices();
        }

        pumpNFlushNClearIndices();
        flushStatementIndices();
        clearTempIndices();

        Neo4jProcessor.LOG.debug("finished flushing indices");
    }

    /**
     * Hook of {@link #flushIndices()}; does nothing here, because the statement hashes index is already flushed by
     * {@link #pumpNFlushNClearIndices()}.
     */
    public void flushStatementIndices() {

        // intentionally empty
    }

    protected void clearTempIndices() {

        clearTempStatementIndices();
    }

    protected void clearTempStatementIndices() {

        tempStatementHashes.clear();
    }

    /** Clears the node-to-resource cache and the blank node map. */
    public void clearMaps() {

        nodeResourceMap.clear();
        bnodes.clear();
    }

    /**
     * Shuts down the Lucene index provider behind all indices that were handed out by
     * {@link #getOrCreateIndex(String, String, boolean, int)}. The indices must not be used afterwards.
     * <p>
     * NOTE(review): nothing in this class calls this method; callers are expected to invoke it before they shut
     * down the batch inserter - confirm the required order against the Neo4j batch insertion documentation.
     */
    public void shutdownIndexProvider() {

        if (indexProvider != null) {

            indexProvider.shutdown();
            indexProvider = null;
        }
    }

    /**
     * Determines the id of an already known node.
     *
     * @param optionalResourceNodeType the type of the node
     * @param optionalResourceId       the identifier of the node (only utilised for blank nodes)
     * @param optionalResourceURI      the URI of the node (only utilised for resources and type resources)
     * @param optionalDataModelURI     the data model URI (only utilised for resources)
     * @return the node id, or absent if the node is unknown, the node is a literal or the information that is
     *         required for the look-up is missing
     */
    public Optional<Long> determineNode(final Optional<NodeType> optionalResourceNodeType,
            final Optional<String> optionalResourceId, final Optional<String> optionalResourceURI,
            final Optional<String> optionalDataModelURI) {

        if (!optionalResourceNodeType.isPresent()) {

            return Optional.absent();
        }

        switch (optionalResourceNodeType.get()) {

        case Resource:

            if (!optionalResourceURI.isPresent()) {

                // no URI -> nothing to look up (instead of failing on Optional#get)
                return Optional.absent();
            }

            if (optionalDataModelURI.isPresent()) {

                return getNodeIdFromResourcesWDataModelIndex(
                        optionalResourceURI.get() + optionalDataModelURI.get());
            }

            return getResourceNodeHits(optionalResourceURI.get());
        case TypeResource:

            if (!optionalResourceURI.isPresent()) {

                // no URI -> nothing to look up (instead of failing on Optional#get)
                return Optional.absent();
            }

            return getNodeIdFromResourceTypesIndex(optionalResourceURI.get());
        case Literal:

            // literal node - should never be the case

            return Optional.absent();
        default:

            // resource must be a blank node; a missing identifier simply yields no hit

            return getNodeIdFromBNodesIndex(optionalResourceId.orNull());
        }
    }

    /**
     * Determines the resource URI for the given subject node and caches the result per node id.
     *
     * @param subjectNodeId           the id of the subject node
     * @param optionalSubjectNodeType the type of the subject node
     * @param optionalSubjectURI      the URI of the subject
     * @param optionalResourceURI     the URI of the resource the statement belongs to
     * @return the resource URI, or absent if it couldn't be determined
     */
    public Optional<String> determineResourceUri(final long subjectNodeId,
            final Optional<NodeType> optionalSubjectNodeType, final Optional<String> optionalSubjectURI,
            final Optional<String> optionalResourceURI) {

        // HPPC idiom: containsKey() remembers the slot that lget() reads afterwards
        if (nodeResourceMap.containsKey(subjectNodeId)) {

            return Optional.of(nodeResourceMap.lget());
        }

        final Optional<String> optionalResourceUri = determineResourceUri(optionalSubjectNodeType,
                optionalSubjectURI, optionalResourceURI);

        if (optionalResourceUri.isPresent()) {

            nodeResourceMap.put(subjectNodeId, optionalResourceUri.get());
        }

        return optionalResourceUri;
    }

    /**
     * Determines the resource URI: the subject URI for (type) resource subjects, otherwise the given resource URI.
     *
     * @param optionalSubjectNodeType the type of the subject node
     * @param optionalSubjectURI      the URI of the subject
     * @param optionalResourceURI     the URI of the resource the statement belongs to
     * @return the resource URI, or absent if it couldn't be determined
     */
    public Optional<String> determineResourceUri(final Optional<NodeType> optionalSubjectNodeType,
            final Optional<String> optionalSubjectURI, final Optional<String> optionalResourceURI) {

        final boolean subjectIsResource = optionalSubjectNodeType.isPresent()
                && (NodeType.Resource.equals(optionalSubjectNodeType.get())
                        || NodeType.TypeResource.equals(optionalSubjectNodeType.get()));

        if (subjectIsResource) {

            return optionalSubjectURI;
        }

        if (optionalResourceURI.isPresent()) {

            return optionalResourceURI;
        }

        // should never be the case

        return Optional.absent();
    }

    /**
     * Adds a label to the given node. Labels the node already carries are kept; the batch inserter can only replace
     * the complete label set of a node, hence the existing labels are read first and written back together with
     * the new one.
     *
     * @param nodeId      the id of the node
     * @param labelString the name of the label
     */
    public void addLabel(final long nodeId, final String labelString) {

        final Label label = DynamicLabel.label(labelString);
        final List<Label> labels = new ArrayList<>();
        final Iterable<Label> existingLabels = inserter.getNodeLabels(nodeId);

        if (existingLabels != null) {

            for (final Label existingLabel : existingLabels) {

                if (existingLabel.name().equals(label.name())) {

                    // the node carries this label already
                    return;
                }

                labels.add(existingLabel);
            }
        }

        labels.add(label);

        inserter.setNodeLabels(nodeId, labels.toArray(new Label[labels.size()]));
    }

    /**
     * Looks up a statement by its hash, first in the in-memory buffer and then in the statement hashes index.
     *
     * @param hash the statement hash
     * @return the id that is registered for the hash, or absent
     * @throws DMPGraphException declared for callers; not thrown by the visible implementation
     */
    public Optional<Long> getStatement(final long hash) throws DMPGraphException {

        return getIdFromLongIndex(hash, tempStatementHashes, statementHashes, GraphStatics.HASH);
    }

    /**
     * Creates the property map of a relationship: the statement UUID plus the order, index, evidence and confidence
     * qualified attributes, as far as they are given.
     *
     * @param statementUUID               the UUID of the statement
     * @param optionalQualifiedAttributes the qualified attributes of the statement
     * @return a new, mutable property map
     */
    public Map<String, Object> prepareRelationship(final String statementUUID,
            final Optional<Map<String, Object>> optionalQualifiedAttributes) {

        final Map<String, Object> relProperties = new HashMap<>();

        relProperties.put(GraphStatics.UUID_PROPERTY, statementUUID);

        if (!optionalQualifiedAttributes.isPresent()) {

            return relProperties;
        }

        final Map<String, Object> qualifiedAttributes = optionalQualifiedAttributes.get();

        copyIfPresent(GraphStatics.ORDER_PROPERTY, qualifiedAttributes, relProperties);
        copyIfPresent(GraphStatics.INDEX_PROPERTY, qualifiedAttributes, relProperties);

        // TODO: versioning handling only implemented for data models right now

        copyIfPresent(GraphStatics.EVIDENCE_PROPERTY, qualifiedAttributes, relProperties);
        copyIfPresent(GraphStatics.CONFIDENCE_PROPERTY, qualifiedAttributes, relProperties);

        return relProperties;
    }

    /** Copies the entry of the given key from source to target, if source contains the key. */
    private static void copyIfPresent(final String key, final Map<String, Object> source,
            final Map<String, Object> target) {

        if (source.containsKey(key)) {

            target.put(key, source.get(key));
        }
    }

    /**
     * Generates the hash of a statement whose object is a node; subject and object identifiers are read from the
     * node properties (see {@link #getIdentifier(long, Optional)}).
     *
     * @throws DMPGraphException if a node type or identifier couldn't be determined
     */
    public long generateStatementHash(final long subjectNodeId, final String predicateName, final long objectNodeId,
            final NodeType subjectNodeType, final NodeType objectNodeType) throws DMPGraphException {

        final Optional<NodeType> optionalSubjectNodeType = Optional.fromNullable(subjectNodeType);
        final Optional<NodeType> optionalObjectNodeType = Optional.fromNullable(objectNodeType);
        final Optional<String> optionalSubjectIdentifier = getIdentifier(subjectNodeId, optionalSubjectNodeType);
        final Optional<String> optionalObjectIdentifier = getIdentifier(objectNodeId, optionalObjectNodeType);

        return generateStatementHash(predicateName, optionalSubjectNodeType, optionalObjectNodeType,
                optionalSubjectIdentifier, optionalObjectIdentifier);
    }

    /**
     * Generates the hash of a statement whose object is given by its value; the subject identifier is read from the
     * node properties (see {@link #getIdentifier(long, Optional)}).
     *
     * @throws DMPGraphException if a node type or identifier couldn't be determined
     */
    public long generateStatementHash(final long subjectNodeId, final String predicateName,
            final String objectValue, final NodeType subjectNodeType, final NodeType objectNodeType)
            throws DMPGraphException {

        final Optional<NodeType> optionalSubjectNodeType = Optional.fromNullable(subjectNodeType);
        final Optional<NodeType> optionalObjectNodeType = Optional.fromNullable(objectNodeType);
        final Optional<String> optionalSubjectIdentifier = getIdentifier(subjectNodeId, optionalSubjectNodeType);
        final Optional<String> optionalObjectIdentifier = Optional.fromNullable(objectValue);

        return generateStatementHash(predicateName, optionalSubjectNodeType, optionalObjectNodeType,
                optionalSubjectIdentifier, optionalObjectIdentifier);
    }

    /**
     * Generates the SipHash of the string
     * "{subject type}:{subject identifier} {predicate} {object type}:{object identifier} ".
     *
     * @throws DMPGraphException if a node type or an identifier is absent
     */
    public long generateStatementHash(final String predicateName, final Optional<NodeType> optionalSubjectNodeType,
            final Optional<NodeType> optionalObjectNodeType, final Optional<String> optionalSubjectIdentifier,
            final Optional<String> optionalObjectIdentifier) throws DMPGraphException {

        if (!optionalSubjectNodeType.isPresent() || !optionalObjectNodeType.isPresent()
                || !optionalSubjectIdentifier.isPresent() || !optionalObjectIdentifier.isPresent()) {

            final String message = "cannot generate statement hash, because the subject node type or object node type or subject identifier or object identifier is not present";

            Neo4jProcessor.LOG.error(message);

            throw new DMPGraphException(message);
        }

        // NOTE: the node types are rendered via Optional#toString() (not via the wrapped value); this must stay as
        // it is, otherwise the generated hashes would no longer match hashes that were generated before
        final String hashString = optionalSubjectNodeType.toString() + ":" + optionalSubjectIdentifier.get() + " "
                + predicateName + " " + optionalObjectNodeType.toString() + ":" + optionalObjectIdentifier.get()
                + " ";

        return SipHash.digest(Neo4jProcessor.SPEC_KEY, hashString.getBytes(Charsets.UTF_8));
    }

    /**
     * Determines the identifier of a node: URI (+ data model, if available) for (type) resources, the node id for
     * blank nodes and the value for literals.
     *
     * @param nodeId           the id of the node
     * @param optionalNodeType the type of the node
     * @return the identifier, or absent if the node type is absent or unknown, or the property is not set
     */
    public Optional<String> getIdentifier(final long nodeId, final Optional<NodeType> optionalNodeType) {

        if (!optionalNodeType.isPresent()) {

            return Optional.absent();
        }

        final String identifier;

        switch (optionalNodeType.get()) {

        case Resource:
        case TypeResource:

            // read the node properties only once for both look-ups
            final Map<String, Object> nodeProperties = inserter.getNodeProperties(nodeId);
            final String uri = (String) getProperty(GraphStatics.URI_PROPERTY, nodeProperties);
            final String dataModel = (String) getProperty(GraphStatics.DATA_MODEL_PROPERTY, nodeProperties);

            if (dataModel == null) {

                identifier = uri;
            } else {

                identifier = uri + dataModel;
            }

            break;
        case BNode:
        case TypeBNode:

            identifier = "" + nodeId;

            break;
        case Literal:

            identifier = (String) getProperty(GraphStatics.VALUE_PROPERTY, inserter.getNodeProperties(nodeId));

            break;
        default:

            identifier = null;

            break;
        }

        return Optional.fromNullable(identifier);
    }

    public abstract void addObjectToResourceWDataModelIndex(final long nodeId, final String URI,
            final Optional<String> optionalDataModelURI);

    public abstract void handleObjectDataModel(final Map<String, Object> objectNodeProperties,
            final Optional<String> optionalDataModelURI);

    public abstract void handleSubjectDataModel(final Map<String, Object> subjectNodeProperties, String URI,
            final Optional<String> optionalDataModelURI);

    public abstract void addStatementToIndex(final long relId, final String statementUUID);

    public abstract Optional<Long> getResourceNodeHits(final String resourceURI);

    /**
     * Creates (or loads) an exact-match Lucene index. All indices are served by one shared index provider, so that
     * it can be shut down again via {@link #shutdownIndexProvider()}.
     *
     * @param name      the name of the index
     * @param property  the property the cache capacity is set for
     * @param nodeIndex {@code true} for a node index, {@code false} for a relationship index
     * @param cachSize  the cache capacity for the given property
     * @return the index
     */
    protected BatchInserterIndex getOrCreateIndex(final String name, final String property, final boolean nodeIndex,
            final int cachSize) {

        if (indexProvider == null) {

            indexProvider = new LuceneBatchInserterIndexProvider(inserter);
        }

        final Map<String, String> indexConfig = MapUtil.stringMap("type", "exact");
        final BatchInserterIndex index;

        if (nodeIndex) {

            index = indexProvider.nodeIndex(name, indexConfig);
        } else {

            index = indexProvider.relationshipIndex(name, indexConfig);
        }

        index.setCacheCapacity(property, cachSize);

        return index;
    }

    /** Returns the value of the given key, or {@code null} if the map is {@code null} or has no such entry. */
    private Object getProperty(final String key, final Map<String, Object> properties) {

        if (properties == null || properties.isEmpty()) {

            return null;
        }

        return properties.get(key);
    }

    /**
     * Looks up an id by a String key: first in the in-memory buffer, then in the persistent index (if it is
     * initialised). Hits from the persistent index are cached in the buffer.
     */
    private Optional<Long> getIdFromIndex(final String key, final ObjectLongOpenHashMap<String> tempIndex,
            final BatchInserterIndex index, final String indexProperty) {

        if (key == null) {

            return Optional.absent();
        }

        // HPPC idiom: containsKey() remembers the slot that lget() reads afterwards
        if (tempIndex.containsKey(key)) {

            return Optional.of(tempIndex.lget());
        }

        final Optional<Long> optionalHit = firstHit(index, indexProperty, key);

        if (optionalHit.isPresent()) {

            // temp cache index hits again
            tempIndex.put(key, optionalHit.get());
        }

        return optionalHit;
    }

    /**
     * Looks up an id by a long key: first in the in-memory buffer, then in the persistent index (if it is
     * initialised). Hits from the persistent index are cached in the buffer.
     */
    private Optional<Long> getIdFromLongIndex(final long key, final LongLongOpenHashMap tempIndex,
            final BatchInserterIndex index, final String indexProperty) {

        // HPPC idiom: containsKey() remembers the slot that lget() reads afterwards
        if (tempIndex.containsKey(key)) {

            return Optional.of(tempIndex.lget());
        }

        final Optional<Long> optionalHit = firstHit(index, indexProperty, key);

        if (optionalHit.isPresent()) {

            // temp cache index hits again
            tempIndex.put(key, optionalHit.get());
        }

        return optionalHit;
    }

    /**
     * Returns the first hit of an exact look-up in the given index and always closes the hits afterwards.
     *
     * @param index         the index to query; may be {@code null} (i.e. not initialised yet)
     * @param indexProperty the property to query
     * @param key           the value to look up
     * @return the first hit, or absent if the index is not initialised or there is no hit
     */
    private static Optional<Long> firstHit(final BatchInserterIndex index, final String indexProperty,
            final Object key) {

        if (index == null) {

            return Optional.absent();
        }

        final IndexHits<Long> hits = index.get(indexProperty, key);

        if (hits == null) {

            return Optional.absent();
        }

        try {

            if (!hits.hasNext()) {

                return Optional.absent();
            }

            return Optional.fromNullable(hits.next());
        } finally {

            // release the hits even if reading them fails
            hits.close();
        }
    }

    /**
     * Counts the entries that are written while a buffer is pumped into an index and logs the throughput after
     * every {@link #PROGRESS_LOG_ENTRY_INTERVAL} entries or {@link #PROGRESS_LOG_TIME_INTERVAL_MILLIS} milliseconds,
     * whichever is reached first.
     */
    private static final class PumpProgress {

        private int written = 0;
        private int writtenAtLastReport = 0;
        private long lastReportMillis = System.currentTimeMillis();

        /** Registers one written entry and logs a progress message if an interval is exceeded. */
        void entryWritten() {

            written++;

            final int entryDelta = written - writtenAtLastReport;
            final long elapsedMillis = System.currentTimeMillis() - lastReportMillis;

            if (entryDelta < Neo4jProcessor.PROGRESS_LOG_ENTRY_INTERVAL
                    && elapsedMillis < Neo4jProcessor.PROGRESS_LOG_TIME_INTERVAL_MILLIS) {

                return;
            }

            // millisecond resolution with a lower bound of 1 ms keeps the rate finite, even when a complete
            // interval is written within less than a second
            final double entriesPerSecond = entryDelta * 1000.0 / Math.max(elapsedMillis, 1L);

            Neo4jProcessor.LOG.debug("wrote '{}' entries @ ~{} entries/second.", written, entriesPerSecond);

            writtenAtLastReport = written;
            lastReportMillis = System.currentTimeMillis();
        }

        int getWritten() {

            return written;
        }
    }
}