org.fcrepo.indexer.IndexerGroup.java Source code

Java tutorial

Introduction

Here is the source code for org.fcrepo.indexer.IndexerGroup.java

Source

/**
 * Copyright 2015 DuraSpace, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fcrepo.indexer;

import com.google.common.base.Strings;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Supplier;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.NodeIterator;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;

import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.impl.client.StandardHttpRequestRetryHandler;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.fcrepo.kernel.utils.EventType;
import org.slf4j.Logger;

import javax.jms.JMSException;
import javax.jms.Message;
import javax.jms.MessageListener;

import java.io.InputStream;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import static com.google.common.base.Suppliers.memoize;
import static com.google.common.base.Throwables.propagate;
import static com.hp.hpl.jena.rdf.model.ResourceFactory.createProperty;
import static com.hp.hpl.jena.rdf.model.ResourceFactory.createResource;
import static com.hp.hpl.jena.vocabulary.RDF.type;
import static java.lang.Integer.MAX_VALUE;
import static javax.jcr.observation.Event.NODE_REMOVED;
import static org.apache.commons.lang.StringUtils.isBlank;
import static org.fcrepo.kernel.FedoraJcrTypes.FCR_METADATA;
import static org.fcrepo.jms.headers.DefaultMessageFactory.JMS_NAMESPACE;
import static org.fcrepo.kernel.RdfLexicon.CONTAINS;
import static org.fcrepo.kernel.RdfLexicon.HAS_PARENT;
import static org.fcrepo.kernel.RdfLexicon.REPOSITORY_NAMESPACE;
import static org.slf4j.LoggerFactory.getLogger;

/**
 * MessageListener implementation that retrieves objects from the repository and
 * invokes one or more indexers to index the content. Documentation:
 * https://wiki.duraspace.org/display/FF/Design+-+Messaging+for+Workflow
 *
 * @author Esmé Cowles
 * @author ajs6f
 * @since Aug 19 2013
 **/
public class IndexerGroup implements MessageListener {

    /** Class-wide SLF4J logger. */
    private static final Logger LOGGER = getLogger(IndexerGroup.class);

    /**
     * The indexers that are invoked, one after another, for every resource
     * passed to index(). Supplied by the constructor and never replaced.
     */
    @VisibleForTesting
    protected final Set<Indexer<Object>> indexers;

    /**
     * URIs already visited during the current reindex() run. A fresh set is
     * assigned at the start of each reindex() call and consulted by
     * reindexURI() so that a resource is not processed twice.
     */
    private Set<URI> reindexed;

    /**
     * Name of the JMS message property that carries the resource identifier;
     * onMessage() appends its value to the base URL to form the resource URI.
     */
    static final String IDENTIFIER_HEADER_NAME = JMS_NAMESPACE + "identifier";

    /**
     * Name of the JMS message property that carries the event properties;
     * onMessage() only logs its value.
     */
    static final String PROPERTIES_HEADER_NAME = JMS_NAMESPACE + "properties";

    /**
     * Name of the JMS message property that carries the repository base URL;
     * onMessage() strips trailing slashes from its value before use.
     */
    static final String BASE_URL_HEADER_NAME = JMS_NAMESPACE + "baseURL";

    /**
     * Name of the JMS message property that carries the event type.
     */
    static final String EVENT_TYPE_HEADER_NAME = JMS_NAMESPACE + "eventType";

    /**
     * Type of event that qualifies as a removal. index() compares the incoming
     * event type against this value to choose between removing and updating.
     */
    static final String REMOVAL_EVENT_TYPE = REPOSITORY_NAMESPACE + EventType.valueOf(NODE_REMOVED).toString();

    /**
     * Type of event to indicate reindexing. Passed to index() by reindexURI().
     */
    private static final String REINDEX_EVENT_TYPE = REPOSITORY_NAMESPACE + "NODE_REINDEXED";

    /** Namespace of the indexing vocabulary used by the constants below. */
    public static final String INDEXER_NAMESPACE = "http://fedora.info/definitions/v4/indexing#";

    /**
     * Indicates the transformation to use with this resource to derive indexing
     * information.
     */
    public static final Property INDEXING_TRANSFORM_PREDICATE = createProperty(
            INDEXER_NAMESPACE + "hasIndexingTransformation");

    /**
     * Indicates that a resource is indexable. index() only retrieves content for
     * resources whose RDF declares this type.
     */
    public static final Resource INDEXABLE_MIXIN = createResource(INDEXER_NAMESPACE + "Indexable");

    // Path segment searched for in httpClient(); when found, the client's base
    // URL is the repository URL up to and including this segment.
    private static final String REST_PREFIX = "/rest/";
    // Only the length of this prefix is used: httpClient() takes it as the
    // starting offset when searching the repository URL for the next '/'.
    private static final String FCREPO_PREFIX = "/fcrepo/";

    /**
     * Indicates that a resource is a datastream.
    **/
    static final Resource DATASTREAM_TYPE = createResource(REPOSITORY_NAMESPACE + "NonRdfSourceDescription");

    // Null placeholder that index() uses as the initial content for each
    // indexer before any content has been retrieved.
    private static final Reader EMPTY_CONTENT = null;

    // Repository credentials; httpClient() configures BASIC credentials on new
    // clients only when both values are non-blank.
    private final String fedoraUsername;
    private final String fedoraPassword;
    // HTTP clients created by httpClient(), keyed by the base URL they were
    // created for; looked up by prefix match against the requested URL.
    private final Map<String, DefaultHttpClient> clients;
    // Client supplied through the testing constructor (null otherwise); when
    // non-null, httpClient() returns it instead of creating a new client.
    private final DefaultHttpClient defaultClient;

    /**
     * Creates an indexer group that builds its own HTTP clients on demand.
     * The supplied credentials are applied to those clients only when both the
     * user name and the password are non-blank.
     *
     * @param indexers the set of indexers to invoke for each event
     * @param fedoraUsername the fedora user name
     * @param fedoraPassword the fedora password
     **/
    public IndexerGroup(final Set<Indexer<Object>> indexers, final String fedoraUsername,
            final String fedoraPassword) {
        this.fedoraPassword = fedoraPassword;
        this.fedoraUsername = fedoraUsername;

        LOGGER.debug("Creating IndexerGroup: {}", this);
        // no preset client: httpClient() will create and cache clients itself
        this.defaultClient = null;
        this.clients = new HashMap<String, DefaultHttpClient>();
        this.indexers = indexers;
    }

    /**
     * Creates an indexer group that uses the given HttpClient instance for all
     * repository requests; intended for tests. No credentials are configured.
     *
     * @param indexers the set of indexers to invoke for each event
     * @param httpClient the http client for testing
    **/
    public IndexerGroup(final Set<Indexer<Object>> indexers, final DefaultHttpClient httpClient) {
        LOGGER.debug("Creating IndexerGroup: {}", this);
        this.defaultClient = httpClient;
        this.fedoraPassword = null;
        this.fedoraUsername = null;
        this.clients = new HashMap<String, DefaultHttpClient>();
        this.indexers = indexers;
    }

    /**
     * Returns the HTTP client to use for the given repository URL.
     *
     * Lookup order: a previously created client whose base URL is a prefix of
     * the given URL; otherwise the client supplied to the testing constructor,
     * if any; otherwise a newly created client, which is cached under the base
     * URL derived from the given URL.
     *
     * @param repositoryURL the URL of the resource about to be requested
     * @return the client to use for that URL
     */
    @VisibleForTesting
    protected DefaultHttpClient httpClient(final String repositoryURL) {
        // reuse a cached client whose base URL prefixes the requested URL
        for (final Map.Entry<String, DefaultHttpClient> cached : clients.entrySet()) {
            if (repositoryURL.startsWith(cached.getKey())) {
                return cached.getValue();
            }
        }

        if (defaultClient != null) {
            return defaultClient;
        }

        // nothing matched: derive the base URL under which the new client is cached
        final int restAt = repositoryURL.indexOf(REST_PREFIX);
        final String baseURL;
        if (restAt > 0) {
            // everything up to and including "/rest/"
            baseURL = repositoryURL.substring(0, restAt + REST_PREFIX.length());
        } else {
            // everything up to and including the first '/' found at or after
            // offset FCREPO_PREFIX.length(); the whole URL if there is none
            final int slashAt = repositoryURL.indexOf("/", FCREPO_PREFIX.length());
            baseURL = slashAt > 0 ? repositoryURL.substring(0, slashAt + 1) : repositoryURL;
        }

        final PoolingClientConnectionManager pool = new PoolingClientConnectionManager();
        pool.setDefaultMaxPerRoute(MAX_VALUE);
        pool.setMaxTotal(MAX_VALUE);

        final DefaultHttpClient created = new DefaultHttpClient(pool);
        created.setHttpRequestRetryHandler(new StandardHttpRequestRetryHandler(0, false));
        created.setRedirectStrategy(new DefaultRedirectStrategy());

        // If the Fedora instance requires authentication, set it up here
        final boolean credentialsMissing = isBlank(fedoraUsername) || isBlank(fedoraPassword);
        if (!credentialsMissing) {
            LOGGER.debug("Adding BASIC credentials to client for repo requests.");

            final URI repoUri = URI.create(baseURL);
            final AuthScope scope = new AuthScope(repoUri.getHost(), repoUri.getPort());
            final CredentialsProvider provider = new BasicCredentialsProvider();
            provider.setCredentials(scope, new UsernamePasswordCredentials(fedoraUsername, fedoraPassword));

            created.setCredentialsProvider(provider);
        }

        clients.put(baseURL, created);
        return created;
    }

    /**
     * Handle a JMS message representing an object update or deletion event.
     *
     * The event type, resource identifier and repository base URL are read
     * from the message's string properties. Trailing slashes are stripped from
     * the base URL, the identifier is appended to it, and the resulting URI is
     * passed to index() together with the event type.
     *
     * A message with no base URL or no identifier is logged as an error and
     * ignored, because concatenating a missing value would otherwise produce a
     * bogus URI containing the text "null". URI syntax errors and JMS errors
     * raised while reading the properties are logged and not rethrown; a
     * failure to read the message ID is logged and rethrown unchecked.
     *
     * @param message the JMS message describing the repository event
     **/
    @Override
    public void onMessage(final Message message) {
        try {
            LOGGER.debug("Received message: {}", message.getJMSMessageID());
        } catch (final JMSException e) {
            // pass the exception as the throwable argument so that the stack
            // trace is logged and no unfilled placeholder is left in the text
            LOGGER.error("Received unintelligible message!", e);
            propagate(e);
        }
        try {
            // get id and eventType from message
            final String eventType = message.getStringProperty(EVENT_TYPE_HEADER_NAME);
            final String id = message.getStringProperty(IDENTIFIER_HEADER_NAME);
            String baseURL = message.getStringProperty(BASE_URL_HEADER_NAME);

            LOGGER.debug("Discovered id: {} in message.", id);
            LOGGER.debug("Discovered event type: {} in message.", eventType);
            LOGGER.debug("Discovered baseURL: {} in message.", baseURL);
            LOGGER.debug("Discovered properties: {} in message.",
                    message.getStringProperty(PROPERTIES_HEADER_NAME));

            // without both headers no meaningful resource URI can be built
            if (Strings.isNullOrEmpty(baseURL) || id == null) {
                LOGGER.error("Ignoring message without baseURL ({}) or identifier ({}).", baseURL, id);
                return;
            }

            // Trim trailing '/'
            while (baseURL.endsWith("/")) {
                baseURL = baseURL.substring(0, baseURL.length() - 1);
            }

            index(new URI(baseURL + id), eventType);
        } catch (final URISyntaxException e) {
            LOGGER.error("Error creating URI", e);
        } catch (final JMSException e) {
            LOGGER.error("Error processing JMS event!", e);
        }
    }

    /**
     * Index a resource.
     *
     * For a removal event every indexer is asked to remove the URI and nothing
     * is retrieved from the repository. For any other event the resource's RDF
     * is retrieved; the resource counts as indexable when either the resource
     * itself or its metadata node is typed with INDEXABLE_MIXIN. If the
     * resource is a datastream outside "/fedora:system/", its parent is indexed
     * first. Each indexer then receives the content matching its indexer type.
     * Resources that are not indexable are skipped without any update.
     *
     * Exceptions thrown by an individual indexer are logged and do not stop
     * the remaining indexers.
     *
     * @param uri the URI of the resource to index
     * @param eventType the type of the event that triggered indexing
     * @throws URISyntaxException if the parent URI found in the RDF is invalid
    **/
    private void index(final URI uri, final String eventType) throws URISyntaxException {
        final boolean removal = REMOVAL_EVENT_TYPE.equals(eventType);
        final String uriString = uri.toString();
        final HttpClient httpClient = httpClient(uriString);
        LOGGER.debug("It is {} that this is a removal operation.", removal);

        // memoized so that each representation is fetched at most once per call,
        // however many indexers ask for it
        final Supplier<Model> rdfr = memoize(new RdfRetriever(uri, httpClient));
        final Supplier<NamedFields> nfr = memoize(new NamedFieldsRetriever(uri, httpClient, rdfr));
        final Supplier<InputStream> jcrfr = memoize(new JcrXmlRetriever(uri, httpClient));
        boolean indexable = false;

        if (!removal) {
            final Model rdf = rdfr.get();
            final Resource subj = createResource(uriString);
            if (rdf.contains(subj, type, INDEXABLE_MIXIN)
                    || rdf.contains(createResource(uriString + "/" + FCR_METADATA), type, INDEXABLE_MIXIN)) {
                LOGGER.debug("Resource: {} retrieved with indexable type.", uri);
                indexable = true;
            } else {
                LOGGER.debug("Resource: {} retrieved without indexable type.", uri);
            }

            // if this is a datastream, also index the parent object
            if (rdf.contains(subj, type, DATASTREAM_TYPE) && uriString.indexOf("/fedora:system/") == -1) {
                final NodeIterator parents = rdf.listObjectsOfProperty(subj, HAS_PARENT);
                if (parents.hasNext()) {
                    final String parent = parents.nextNode().asResource().getURI();
                    LOGGER.info("Datastream found, also indexing parent {}", parent);
                    index(new URI(parent), "NODE_UPDATED");
                }
            }
        }

        for (final Indexer<Object> indexer : indexers) {
            LOGGER.debug("Operating for indexer: {}", indexer);
            boolean hasContent = false;
            Object content = EMPTY_CONTENT;
            // indexable can only be true for non-removal events
            if (indexable) {
                switch (indexer.getIndexerType()) {
                case NAMEDFIELDS:
                    LOGGER.debug("Retrieving named fields for: {}, (may be cached) to index to {}...", uri,
                            indexer);
                    try {
                        content = nfr.get();
                        hasContent = true;
                    } catch (final AbsentTransformPropertyException e) {
                        LOGGER.error(
                                "Failed to retrieve indexable content:" + "could not find transform property!");
                        hasContent = false;
                    }
                    break;
                case RDF:
                    LOGGER.debug("Retrieving RDF for: {}, (may be cached) to index to {}...", uri, indexer);
                    content = rdfr.get();
                    hasContent = true;
                    break;
                case JCRXML_PERSISTENCE:
                    LOGGER.debug("Retrieving jcr/xml for: {} and persist it to {}...", uri, indexer);
                    content = jcrfr.get();
                    hasContent = true;
                    break;
                default:
                    // other indexer types are updated with the empty placeholder content
                    hasContent = true;
                    break;
                }
            }

            try {
                if (removal) {
                    LOGGER.debug("Executing removal of: {} to indexer: {}...", uri, indexer);
                    indexer.remove(uri);
                } else if (hasContent) {
                    LOGGER.debug("Executing update of: {} to indexer: {}...", uri, indexer);
                    indexer.update(uri, content);
                } else if (indexable) {
                    LOGGER.error("Received update for: {} but was unable to retrieve "
                            + "content for update to indexer: {}!", uri, indexer);
                }
            } catch (final Exception e) {
                // the trailing exception is logged as the throwable (with stack
                // trace), so it must not have a placeholder of its own
                LOGGER.error("Error {} indexing {}!", indexer.getClass().getName(), uri, e);
            }
        }
    }

    /**
     * Reindexes the given resource and, if requested, every resource reachable
     * from it through containment. The record of already-visited URIs is reset
     * at the start of each call.
     *
     * @param uri The resource URI to reindex.
     * @param recursive If true, also recursively reindex all children.
     * @throws URISyntaxException if URI syntax exception occurred
    **/
    public void reindex(final URI uri, final boolean recursive) throws URISyntaxException {
        // start every run with an empty visited set
        this.reindexed = new HashSet<URI>();
        this.reindexURI(uri, recursive);
    }

    /**
     * Reindexes one resource and, when recursive, each of its children that has
     * not been visited yet during the current reindex() run.
     *
     * @param uri the resource URI to reindex
     * @param recursive if true, descend into the objects of the CONTAINS property
     * @throws URISyntaxException if a child URI found in the RDF is invalid
     */
    private void reindexURI(final URI uri, final boolean recursive) throws URISyntaxException {
        LOGGER.debug("Reindexing {}, recursive: {}", uri, recursive);
        final boolean alreadyVisited = reindexed.contains(uri);
        if (!alreadyVisited) {
            // index() will check for indexable mixin
            index(uri, REINDEX_EVENT_TYPE);
        }

        // prevent infinite recursion
        reindexed.add(uri);

        if (!recursive) {
            return;
        }

        // look up the children; a new retriever is built here, so this does not
        // reuse the supplier that index() created for the same URI
        final Supplier<Model> retriever = memoize(new RdfRetriever(uri, httpClient(uri.toString())));
        final Model containerRdf = retriever.get();
        for (final NodeIterator kids = containerRdf.listObjectsOfProperty(CONTAINS); kids.hasNext();) {
            final URI kid = new URI(kids.nextNode().asResource().getURI());
            if (!reindexed.contains(kid)) {
                reindexURI(kid, true);
            }
        }
    }
}