org.apache.rya.indexing.pcj.storage.mongo.MongoPcjDocuments.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.rya.indexing.pcj.storage.mongo.MongoPcjDocuments.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.rya.indexing.pcj.storage.mongo;

import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.Objects.requireNonNull;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.rya.api.domain.RyaType;
import org.apache.rya.api.model.VisibilityBindingSet;
import org.apache.rya.api.resolver.RdfToRyaConversions;
import org.apache.rya.api.resolver.RyaToRdfConversions;
import org.apache.rya.api.utils.CloseableIterator;
import org.apache.rya.indexing.pcj.storage.PcjMetadata;
import org.apache.rya.indexing.pcj.storage.PrecomputedJoinStorage.PCJStorageException;
import org.apache.rya.indexing.pcj.storage.accumulo.PcjVarOrderFactory;
import org.apache.rya.indexing.pcj.storage.accumulo.ShiftVarOrderFactory;
import org.apache.rya.indexing.pcj.storage.accumulo.VariableOrder;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.query.BindingSet;
import org.eclipse.rdf4j.query.MalformedQueryException;
import org.eclipse.rdf4j.query.QueryEvaluationException;
import org.eclipse.rdf4j.query.QueryLanguage;
import org.eclipse.rdf4j.query.TupleQuery;
import org.eclipse.rdf4j.query.TupleQueryResult;
import org.eclipse.rdf4j.query.impl.MapBindingSet;
import org.eclipse.rdf4j.repository.RepositoryConnection;
import org.eclipse.rdf4j.repository.RepositoryException;

import com.mongodb.MongoClient;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.util.JSON;

/**
 * Creates and modifies PCJs in MongoDB. PCJ's are stored as follows:
 *
 * <pre>
 * <code>
 * ----- PCJ Metadata Doc -----
 * {
 *   _id: [pcj_ID]_METADATA,
 *   sparql: [sparql query to match results],
 *   varOrders: [varOrder1, VarOrder2, ..., VarOrdern]
 *   cardinality: [number of results]
 * }
 *
 * ----- PCJ Results Doc -----
 * {
 *   pcjId: [pcj_ID],
 *   visibilities: [visibilities]
 *   [binding_var1]: {
 *     uri: [type_uri],
 *     value: [value]
 *   }
 *   .
 *   .
 *   .
 *   [binding_varn]: {
 *     uri: [type_uri],
 *     value: [value]
 *   }
 * }
 * </code>
 * </pre>
 */
public class MongoPcjDocuments {
    public static final String PCJ_COLLECTION_NAME = "pcjs";

    // metadata fields
    public static final String CARDINALITY_FIELD = "cardinality";
    public static final String SPARQL_FIELD = "sparql";
    public static final String PCJ_METADATA_ID = "_id";
    public static final String VAR_ORDER_FIELD = "varOrders";

    // pcj results fields
    private static final String BINDING_VALUE = "value";
    private static final String BINDING_TYPE = "rdfType";
    private static final String VISIBILITIES_FIELD = "visibilities";
    private static final String PCJ_ID = "pcjId";

    private final MongoCollection<Document> pcjCollection;
    private static final PcjVarOrderFactory pcjVarOrderFactory = new ShiftVarOrderFactory();
    private static final ValueFactory VF = SimpleValueFactory.getInstance();

    /**
     * Creates a new {@link MongoPcjDocuments}.
     * @param client - The {@link MongoClient} to use to connect to mongo.
     * @param ryaInstanceName - The rya instance to connect to.
     */
    public MongoPcjDocuments(final MongoClient client, final String ryaInstanceName) {
        requireNonNull(client);
        requireNonNull(ryaInstanceName);
        pcjCollection = client.getDatabase(ryaInstanceName).getCollection(PCJ_COLLECTION_NAME);
    }

    private String makeMetadataID(final String pcjId) {
        return pcjId + "_METADATA";
    }

    /**
     * Creates a {@link Document} containing the metadata defining the PCj.
     *
     * @param pcjId - Uniquely identifies a PCJ within Rya. (not null)
     * @param sparql - The sparql query the PCJ will use.
     * @return The document built around the provided metadata.
     * @throws PCJStorageException - Thrown when the sparql query is malformed.
     */
    public Document makeMetadataDocument(final String pcjId, final String sparql) throws PCJStorageException {
        requireNonNull(pcjId);
        requireNonNull(sparql);

        final Set<VariableOrder> varOrders;
        try {
            varOrders = pcjVarOrderFactory.makeVarOrders(sparql);
        } catch (final MalformedQueryException e) {
            throw new PCJStorageException("Can not create the PCJ. The SPARQL is malformed.", e);
        }

        return new Document().append(PCJ_METADATA_ID, makeMetadataID(pcjId)).append(SPARQL_FIELD, sparql)
                .append(CARDINALITY_FIELD, 0).append(VAR_ORDER_FIELD, varOrders);

    }

    /**
     * Creates a new PCJ based on the provided metadata. The initial pcj results
     * will be empty.
     *
     * @param pcjId - Uniquely identifies a PCJ within Rya.
     * @param sparql - The query the pcj is assigned to.
     * @throws PCJStorageException - Thrown when the sparql query is malformed.
     */
    public void createPcj(final String pcjId, final String sparql) throws PCJStorageException {
        pcjCollection.insertOne(makeMetadataDocument(pcjId, sparql));
    }

    /**
     * Creates a new PCJ document and populates it by scanning an instance of
     * Rya for historic matches.
     * <p>
     * If any portion of this operation fails along the way, the partially
     * create PCJ documents will be left in Mongo.
     *
     * @param ryaConn - Connects to the Rya that will be scanned. (not null)
     * @param pcjId - Uniquely identifies a PCJ within Rya. (not null)
     * @param sparql - The SPARQL query whose results will be loaded into the PCJ results document. (not null)
     * @throws PCJStorageException The PCJ documents could not be create or the
     *     values from Rya were not able to be loaded into it.
     */
    public void createAndPopulatePcj(final RepositoryConnection ryaConn, final String pcjId, final String sparql)
            throws PCJStorageException {
        checkNotNull(ryaConn);
        checkNotNull(pcjId);
        checkNotNull(sparql);

        // Create the PCJ document in Mongo.
        createPcj(pcjId, sparql);

        // Load historic matches from Rya into the PCJ results document.
        populatePcj(pcjId, ryaConn);
    }

    /**
     * Gets the {@link PcjMetadata} from a provided PCJ Id.
     *
     * @param pcjId - The Id of the PCJ to get from MongoDB. (not null)
     * @return - The {@link PcjMetadata} of the Pcj specified.
     * @throws PCJStorageException The PCJ metadata document does not exist.
     */
    public PcjMetadata getPcjMetadata(final String pcjId) throws PCJStorageException {
        requireNonNull(pcjId);

        // since query by ID, there will only be one.
        final Document result = pcjCollection.find(new Document(PCJ_METADATA_ID, makeMetadataID(pcjId))).first();

        if (result == null) {
            throw new PCJStorageException("The PCJ: " + pcjId + " does not exist.");
        }

        final String sparql = result.getString(SPARQL_FIELD);
        final int cardinality = result.getInteger(CARDINALITY_FIELD, 0);
        final List<List<String>> varOrders = (List<List<String>>) result.get(VAR_ORDER_FIELD);
        final Set<VariableOrder> varOrder = new HashSet<>();
        for (final List<String> vars : varOrders) {
            varOrder.add(new VariableOrder(vars));
        }

        return new PcjMetadata(sparql, cardinality, varOrder);
    }

    /**
     * Adds binding set results to a specific PCJ.
     *
     * @param pcjId - Uniquely identifies a PCJ within Rya. (not null)
     * @param results - The binding set results. (not null)
     */
    public void addResults(final String pcjId, final Collection<VisibilityBindingSet> results) {
        checkNotNull(pcjId);
        checkNotNull(results);

        final List<Document> pcjDocs = new ArrayList<>();
        for (final VisibilityBindingSet vbs : results) {
            // each binding gets it's own doc.
            final Document bindingDoc = new Document(PCJ_ID, pcjId);
            vbs.forEach(binding -> {
                final RyaType type = RdfToRyaConversions.convertValue(binding.getValue());
                bindingDoc.append(binding.getName(),
                        new Document().append(BINDING_TYPE, type.getDataType().stringValue()).append(BINDING_VALUE,
                                type.getData()));
            });
            bindingDoc.append(VISIBILITIES_FIELD, vbs.getVisibility());
            pcjDocs.add(bindingDoc);
        }
        pcjCollection.insertMany(pcjDocs);

        // update cardinality in the metadata doc.
        final int appendCardinality = pcjDocs.size();
        final Bson query = new Document(PCJ_METADATA_ID, makeMetadataID(pcjId));
        final Bson update = new Document("$inc", new Document(CARDINALITY_FIELD, appendCardinality));
        pcjCollection.updateOne(query, update);
    }

    /**
     * Purges all results from the PCJ results document with the provided Id.
     *
     * @param pcjId - The Id of the PCJ to purge. (not null)
     */
    public void purgePcjs(final String pcjId) {
        requireNonNull(pcjId);

        // remove every doc for the pcj, except the metadata
        final Bson filter = new Document(PCJ_ID, pcjId);
        pcjCollection.deleteMany(filter);

        // reset cardinality
        final Bson query = new Document(PCJ_METADATA_ID, makeMetadataID(pcjId));
        final Bson update = new Document("$set", new Document(CARDINALITY_FIELD, 0));
        pcjCollection.updateOne(query, update);
    }

    /**
     * Scan Rya for results that solve the PCJ's query and store them in the PCJ
     * document.
     * <p>
     * This method assumes the PCJ document has already been created.
     *
     * @param pcjId - The Id of the PCJ that will receive the results. (not null)
     * @param ryaConn - A connection to the Rya store that will be queried to find results. (not null)
     * @throws PCJStorageException If results could not be written to the PCJ results document,
     *  the PCJ results document does not exist, or the query that is being execute was malformed.
     */
    public void populatePcj(final String pcjId, final RepositoryConnection ryaConn) throws PCJStorageException {
        checkNotNull(pcjId);
        checkNotNull(ryaConn);

        try {
            // Fetch the query that needs to be executed from the PCJ metadata document.
            final PcjMetadata pcjMetadata = getPcjMetadata(pcjId);
            final String sparql = pcjMetadata.getSparql();

            // Query Rya for results to the SPARQL query.
            final TupleQuery query = ryaConn.prepareTupleQuery(QueryLanguage.SPARQL, sparql);
            final TupleQueryResult results = query.evaluate();

            // Load batches of 1000 of them at a time into the PCJ results document.
            final Set<VisibilityBindingSet> batch = new HashSet<>(1000);
            while (results.hasNext()) {
                final VisibilityBindingSet bs = new VisibilityBindingSet(results.next());
                batch.add(bs);
                if (batch.size() == 1000) {
                    addResults(pcjId, batch);
                    batch.clear();
                }
            }

            if (!batch.isEmpty()) {
                addResults(pcjId, batch);
            }

        } catch (RepositoryException | MalformedQueryException | QueryEvaluationException e) {
            throw new PCJStorageException(
                    "Could not populate a PCJ document with Rya results for the pcj with Id: " + pcjId, e);
        }
    }

    /**
     * List the document Ids of the PCJs that are stored in MongoDB
     * for this instance of Rya.
     *
     * @return A list of pcj document Ids that hold PCJ index data for the current
     *   instance of Rya.
     */
    public List<String> listPcjDocuments() {
        final List<String> pcjIds = new ArrayList<>();

        //This Bson string reads as:
        //{} - no search criteria: find all
        //{ _id: 1 } - only return the _id, which is the PCJ Id.
        final FindIterable<Document> rez = pcjCollection
                .find((Bson) JSON.parse("{ }, { " + PCJ_METADATA_ID + ": 1 , _id: 0}"));
        final Iterator<Document> iter = rez.iterator();
        while (iter.hasNext()) {
            pcjIds.add(iter.next().get(PCJ_METADATA_ID).toString().replace("_METADATA", ""));
        }

        return pcjIds;
    }

    /**
     * Returns all of the results of a PCJ.
     *
     * @param pcjId
     *            - The PCJ to get the results for. (not null)
     * @return The authorized PCJ results.
     */
    public CloseableIterator<BindingSet> listResults(final String pcjId) {
        requireNonNull(pcjId);

        // get all results based on pcjId
        return queryForBindings(new Document(PCJ_ID, pcjId));
    }

    /**
     * Retrieves the stored {@link BindingSet} results for the provided pcjId.
     *
     * @param pcjId - The Id of the PCJ to retrieve results from.
     * @param restrictionBindings - The collection of {@link BindingSet}s to restrict results.
     * <p>
     * Note: the result restrictions from {@link BindingSet}s are an OR
     * over ANDS in that: <code>
     *  [
     *     bindingset: binding AND binding AND binding,
     *     OR
     *     bindingset: binding AND binding AND binding,
     *     .
     *     .
     *     .
     *     OR
     *     bindingset: binding
     *  ]
     * </code>
     * @return
     */
    public CloseableIterator<BindingSet> getResults(final String pcjId,
            final Collection<BindingSet> restrictionBindings) {
        // empty bindings return all results.
        if (restrictionBindings.size() == 1 && restrictionBindings.iterator().next().size() == 0) {
            return listResults(pcjId);
        }

        final Document query = new Document(PCJ_ID, pcjId);
        final Document bindingSetDoc = new Document();
        final List<Document> bindingSetList = new ArrayList<>();
        restrictionBindings.forEach(bindingSet -> {
            final Document bindingDoc = new Document();
            final List<Document> bindings = new ArrayList<>();
            bindingSet.forEach(binding -> {
                final RyaType type = RdfToRyaConversions.convertValue(binding.getValue());
                final Document typeDoc = new Document().append(BINDING_TYPE, type.getDataType().stringValue())
                        .append(BINDING_VALUE, type.getData());
                final Document bind = new Document(binding.getName(), typeDoc);
                bindings.add(bind);
            });
            bindingDoc.append("$and", bindings);
            bindingSetList.add(bindingDoc);
        });
        bindingSetDoc.append("$or", bindingSetList);
        return queryForBindings(query);
    }

    private CloseableIterator<BindingSet> queryForBindings(final Document query) {
        final FindIterable<Document> rez = pcjCollection.find(query);
        final Iterator<Document> resultsIter = rez.iterator();
        return new CloseableIterator<BindingSet>() {
            @Override
            public boolean hasNext() {
                return resultsIter.hasNext();
            }

            @Override
            public BindingSet next() {
                final Document bs = resultsIter.next();
                final MapBindingSet binding = new MapBindingSet();
                for (final String key : bs.keySet()) {
                    if (key.equals(VISIBILITIES_FIELD)) {
                        // has auths, is a visibility binding set.
                    } else if (!key.equals("_id") && !key.equals(PCJ_ID)) {
                        // is the binding value.
                        final Document typeDoc = (Document) bs.get(key);
                        final IRI dataType = VF.createIRI(typeDoc.getString(BINDING_TYPE));
                        final RyaType type = new RyaType(dataType, typeDoc.getString(BINDING_VALUE));
                        final Value value = RyaToRdfConversions.convertValue(type);
                        binding.addBinding(key, value);
                    }
                }
                return binding;
            }

            @Override
            public void close() throws Exception {
            }
        };
    }

    /**
     * Drops a pcj based on the PCJ Id. Removing the entire document from Mongo.
     *
     * @param pcjId - The identifier for the PCJ to remove.
     */
    public void dropPcj(final String pcjId) {
        purgePcjs(pcjId);
        pcjCollection.deleteOne(new Document(PCJ_METADATA_ID, makeMetadataID(pcjId)));
    }
}