org.phenotips.vocabulary.internal.solr.AbstractOWLSolrVocabulary.java Source code

Introduction

Here is the source code for org.phenotips.vocabulary.internal.solr.AbstractOWLSolrVocabulary.java
Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */
package org.phenotips.vocabulary.internal.solr;

import org.phenotips.vocabulary.VocabularyExtension;
import org.phenotips.vocabulary.VocabularyTerm;

import java.io.IOException;
import java.util.Collection;
import java.util.List;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;

import com.hp.hpl.jena.ontology.OntClass;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.OntModelSpec;
import com.hp.hpl.jena.ontology.Ontology;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;

/**
 * Ontologies processed from OWL files share much of the processing code.
 * <p>
 * This base class reads an OWL ontology into a Jena {@link OntModel}, walks the subclasses of the root classes
 * supplied by the concrete vocabulary, and writes one Solr document per subclass. Concrete vocabularies provide the
 * ontology-specific pieces (term prefix, root classes, ID formatting, property handling) through the abstract methods
 * declared at the bottom of this class.
 * </p>
 *
 * @version $Id: 658b7be057341253fd4c461e488be208d43ff12d $
 * @since 1.3
 */
public abstract class AbstractOWLSolrVocabulary extends AbstractSolrVocabulary {
    /** Flag passed to {@code listSuperClasses}: {@code DIRECT} for direct parents only, {@code !DIRECT} for all. */
    static final Boolean DIRECT = true;

    /** Separator between the optional term prefix and the rest of a term identifier. */
    static final String SEPARATOR = ":";

    /** Name of the Solr field that stores the ontology version. */
    private static final String VERSION_FIELD_NAME = "version";

    /** Name of the Solr field that stores the label of the root category a term was indexed under. */
    private static final String TERM_GROUP_LABEL = "term_group";

    /** Identifier of the special Solr document that only carries the ontology version. */
    private static final String HEADER_INFO_LABEL = "HEADER_INFO";

    @Override
    public VocabularyTerm getTerm(@Nullable final String id) {
        return StringUtils.isNotBlank(id) ? getTerm(id, super.getTerm(id)) : null;
    }

    /**
     * Returns the result from the {@code firstAttempt first attempt} at search if not null, otherwise performs a search
     * for {@code id} without prefix (if the {@code id} has one).
     *
     * @param id the ID of the term of interest
     * @param firstAttempt the result of the first search attempt, can be null
     * @return the {@link VocabularyTerm} corresponding with the given ID, null if no such {@link VocabularyTerm} exists
     */
    private VocabularyTerm getTerm(@Nonnull final String id, @Nullable final VocabularyTerm firstAttempt) {
        return firstAttempt != null ? firstAttempt : searchTermWithoutPrefix(id);
    }

    /**
     * If the {@code id} starts with the optional prefix (compared case-insensitively), removes the prefix and performs
     * the search again.
     *
     * @param id the ID of the term of interest
     * @return the {@link VocabularyTerm} corresponding with the given ID, null if no such {@link VocabularyTerm} exists
     */
    private VocabularyTerm searchTermWithoutPrefix(@Nonnull final String id) {
        final String optPrefix = this.getTermPrefix() + SEPARATOR;
        // Compare without String#toUpperCase(), whose result depends on the default locale (e.g. under a Turkish
        // locale "i" upper-cases to a dotted capital I, so "mim:" would not match "MIM:").
        if (!StringUtils.startsWithIgnoreCase(id, optPrefix)) {
            return null;
        }
        // Strip exactly the matched prefix; a blank remainder is rejected by getTerm(String).
        return getTerm(id.substring(optPrefix.length()));
    }

    /**
     * Delete all the data in the Solr index.
     *
     * @return {@code 0} if the command was successful, {@code 1} otherwise
     */
    protected int clear() {
        try {
            this.externalServicesAccess.getSolrConnection(getCoreName()).deleteByQuery("*:*");
            return 0;
        } catch (SolrServerException ex) {
            this.logger.error("SolrServerException while clearing the Solr index", ex);
        } catch (IOException ex) {
            this.logger.error("IOException while clearing the Solr index", ex);
        }
        return 1;
    }

    /**
     * Clears the index and indexes the vocabulary again from the given source. Every registered extension that
     * supports this vocabulary is notified before indexing starts and, in all cases, after it ends. The outcome of
     * {@link #clear()} is not checked: indexing is attempted regardless, and only its result is returned.
     *
     * @param sourceUrl the source URL for the vocabulary, may be blank to use the default source location
     * @return the value returned by {@link #index(String)}
     */
    @Override
    public int reindex(@Nullable final String sourceUrl) {
        int retval;
        try {
            for (final VocabularyExtension ext : this.extensions.get()) {
                if (ext.isVocabularySupported(this)) {
                    ext.indexingStarted(this);
                }
            }
            this.clear();
            retval = this.index(sourceUrl);
        } finally {
            // Always tell the extensions that indexing is over, even when it failed with an exception.
            for (final VocabularyExtension ext : this.extensions.get()) {
                if (ext.isVocabularySupported(this)) {
                    ext.indexingEnded(this);
                }
            }
        }
        return retval;
    }

    /**
     * Given a {@code sourceUrl source URL} for the vocabulary, return {@code 0} iff the vocabulary is indexed
     * successfully, {@code 1} otherwise.
     *
     * @param sourceUrl the source URL for the vocabulary, as string; if blank, the default source location is used
     * @return {@code 0} iff the vocabulary is indexed successfully, {@code 1} otherwise
     */
    protected int index(@Nullable final String sourceUrl) {
        final String url = StringUtils.isNotBlank(sourceUrl) ? sourceUrl : getDefaultSourceLocation();
        // Fetch the ontology. If this is over the network, it may take a while.
        final OntModel ontModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM_TRANS_INF);
        ontModel.read(url);
        // Get the root classes of the ontology that we can start the parsing with.
        final Collection<OntClass> roots = getRootClasses(ontModel);
        // Reusing doc for speed (see http://wiki.apache.org/lucene-java/ImproveIndexingSpeed).
        final SolrInputDocument doc = new SolrInputDocument();
        try {
            // Set the ontology model version.
            setVersion(doc, ontModel);
            // Create and add solr documents for each of the roots.
            for (final OntClass root : roots) {
                // Don't want to add Solr documents for general root categories, so start adding children.
                addChildDocs(doc, root);
            }
            commitDocs();
            return 0;
        } catch (SolrServerException ex) {
            // The exception is passed as the trailing argument so the stack trace is not lost.
            this.logger.warn("Failed to index ontology: {}", ex.getMessage(), ex);
        } catch (IOException ex) {
            this.logger.warn("Failed to communicate with the Solr server while indexing ontology: {}",
                    ex.getMessage(), ex);
        } catch (OutOfMemoryError ex) {
            this.logger.warn("Failed to add terms to the Solr. Ran out of memory. {}", ex.getMessage());
        }
        return 1;
    }

    /**
     * Create a document for the ontology class, and add it to the index. The reusable document is cleared afterwards
     * so that it can be filled in for the next class.
     *
     * @param doc the reusable Solr input document
     * @param ontClass the ontology class that should be parsed
     * @param root the top root category for ontClass
     * @throws IOException if failed to communicate with Solr server while adding the document
     * @throws SolrServerException if the Solr server failed to add the document
     */
    private void addDoc(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass ontClass,
            @Nonnull final OntClass root) throws IOException, SolrServerException {
        parseSolrDocumentFromOntClass(doc, ontClass, root);
        parseSolrDocumentFromOntParentClasses(doc, ontClass);
        extendTerm(new SolrVocabularyInputTerm(doc, this));
        this.externalServicesAccess.getSolrConnection(getCoreName()).add(doc);
        doc.clear();
    }

    /**
     * Adds any of the sub-documents of the specified ontology class, committing every time
     * {@link #getSolrDocsPerBatch()} documents have been added.
     *
     * @param doc the reusable Solr input document
     * @param ontClass the ontology class that should be parsed
     * @throws IOException if failed to communicate with Solr server while adding or committing documents
     * @throws SolrServerException if the Solr server failed to add or commit documents
     */
    private void addChildDocs(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass ontClass)
            throws IOException, SolrServerException {
        // Get all the subclasses of ontClass, and add a Solr document for each of them.
        final ExtendedIterator<OntClass> subClasses = ontClass.listSubClasses();
        try {
            int counter = 0;
            while (subClasses.hasNext()) {
                if (counter == getSolrDocsPerBatch()) {
                    commitDocs();
                    counter = 0;
                }
                final OntClass subClass = subClasses.next();
                addDoc(doc, subClass, ontClass);
                counter++;
            }
        } finally {
            // Release the iterator even when adding or committing a document fails.
            subClasses.close();
        }
    }

    /**
     * Commits the batch of newly-processed documents, and empties the term cache for this vocabulary's core.
     *
     * @throws IOException if failed to communicate with Solr server while committing
     * @throws SolrServerException if the Solr server failed to commit
     */
    private void commitDocs() throws IOException, SolrServerException {
        this.externalServicesAccess.getSolrConnection(getCoreName()).commit();
        this.externalServicesAccess.getTermCache(getCoreName()).removeAll();
    }

    /**
     * Retrieves the ontology version stored in the index, by looking for any document that has a version field.
     *
     * @return the indexed version, or {@code null} if no version is indexed or the query failed
     */
    @Override
    public String getVersion() {
        final SolrQuery query = new SolrQuery();
        query.setQuery(VERSION_FIELD_NAME + ":*");
        query.set(CommonParams.ROWS, "1");
        try {
            final QueryResponse response = this.externalServicesAccess.getSolrConnection(getCoreName())
                    .query(query);
            final SolrDocumentList termList = response.getResults();

            if (termList != null && !termList.isEmpty()) {
                final SolrDocument firstDoc = termList.get(0);
                final Object version = firstDoc.getFieldValue(VERSION_FIELD_NAME);
                if (version != null) {
                    return version.toString();
                }
            }
        } catch (SolrServerException | SolrException | IOException ex) {
            this.logger.warn("Failed to query ontology version: {}", ex.getMessage(), ex);
        }
        return null;
    }

    /**
     * Sets the ontology version data, by adding a dedicated header document to the index. Nothing is added when the
     * model contains no ontology with the {@link #getBaseOntologyUri() base URI}, or when that ontology has no version
     * information.
     *
     * @param doc the Solr input document
     * @param ontModel the ontology model
     * @throws IOException if failed to communicate with Solr server while indexing ontology
     * @throws SolrServerException if failed to index ontology
     */
    private void setVersion(@Nonnull final SolrInputDocument doc, @Nonnull final OntModel ontModel)
            throws IOException, SolrServerException {
        // getOntology returns null when the model holds no ontology resource with the requested URI.
        final Ontology ontology = ontModel.getOntology(getBaseOntologyUri());
        if (ontology == null) {
            return;
        }
        final String version = ontology.getVersionInfo();
        if (StringUtils.isNotBlank(version)) {
            doc.addField(ID_FIELD_NAME, HEADER_INFO_LABEL);
            doc.addField(VERSION_FIELD_NAME, version);
            this.externalServicesAccess.getSolrConnection(getCoreName()).add(doc);
            doc.clear();
        }
    }

    /**
     * Creates a Solr document from the provided ontology class: sets the formatted identifier, the label of the root
     * category, and the properties of the class.
     *
     * @param doc Solr input document
     * @param ontClass the ontology class
     * @param root the top root category for ontClass
     */
    private void parseSolrDocumentFromOntClass(@Nonnull final SolrInputDocument doc,
            @Nonnull final OntClass ontClass, @Nonnull final OntClass root) {
        doc.addField(ID_FIELD_NAME, getFormattedOntClassId(ontClass.getLocalName()));
        doc.addField(TERM_GROUP_LABEL, root.getLabel(null));
        extractProperties(doc, ontClass);
    }

    /**
     * Adds parent data for provided ontology class to the Solr document.
     *
     * @param doc Solr input document
     * @param ontClass the ontology class
     */
    private void parseSolrDocumentFromOntParentClasses(@Nonnull final SolrInputDocument doc,
            @Nonnull final OntClass ontClass) {
        // For anonymous classes, we're only interested in the direct parents.
        final List<OntClass> directParents = ontClass.listSuperClasses(DIRECT).toList();
        // This will list all superclasses for ontClass.
        final ExtendedIterator<OntClass> allParents = ontClass.listSuperClasses(!DIRECT);
        try {
            while (allParents.hasNext()) {
                final OntClass parent = allParents.next();
                // We're interested in all non-anonymous parents (these are parent disorders), but only the direct
                // anonymous parents (these are the class properties).
                if (!parent.isAnon() || directParents.contains(parent)) {
                    extractClassData(doc, ontClass, parent);
                }
            }
        } finally {
            // Release the iterator even when extracting the class data fails.
            allParents.close();
        }
    }

    /**
     * Extracts properties from the ontology class, and adds the data to the Solr input document.
     *
     * @param doc the Solr input document
     * @param ontClass the ontology class
     */
    private void extractProperties(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass ontClass) {
        final ExtendedIterator<Statement> statements = ontClass.listProperties();
        try {
            while (statements.hasNext()) {
                final Statement statement = statements.next();

                final RDFNode object = statement.getObject();
                final String relation = statement.getPredicate().getLocalName();

                writeProperty(doc, relation, object);
            }
        } finally {
            // Release the iterator even when writing a property fails.
            statements.close();
        }
    }

    /**
     * Returns a prefix for the vocabulary term (e.g. ORDO, HPO).
     *
     * @return the prefix for the vocabulary term, as string
     */
    abstract String getTermPrefix();

    /**
     * Extracts relevant data from the parent class of ontClass, and writes it to the Solr input document associated
     * with ontClass.
     *
     * @param doc the Solr input document
     * @param ontClass the ontology class of interest
     * @param parent the parent of ontClass
     */
    abstract void extractClassData(@Nonnull SolrInputDocument doc, @Nonnull OntClass ontClass,
            @Nonnull OntClass parent);

    /**
     * Get the string identifier to index for the given localName of an OWL class (for example, a numerical id string
     * from a localName in the form "Orphanet_XXX"). If localName is an empty string or is null, will return null.
     *
     * @param localName the localName of an OWL class, may be null or empty
     * @return the string id, or null if localName is null or empty
     */
    abstract String getFormattedOntClassId(@Nullable String localName);

    /**
     * Adds the property value to the Solr input document, if it is an item of interest.
     *
     * @param doc the Solr input document
     * @param relation property name
     * @param object the rdf data node
     */
    abstract void writeProperty(@Nonnull SolrInputDocument doc, @Nonnull String relation, @Nonnull RDFNode object);

    /**
     * Get a collection of root classes from the provided ontology model.
     *
     * @param ontModel the provided ontology model
     * @return a collection of root classes
     */
    abstract Collection<OntClass> getRootClasses(@Nonnull OntModel ontModel);

    /**
     * The number of documents to be added and committed to Solr at a time.
     *
     * @return the number of documents as an integer
     */
    abstract int getSolrDocsPerBatch();

    /**
     * Retrieves the base URI for the ontology.
     *
     * @return the base URI of the ontology, as string
     */
    abstract String getBaseOntologyUri();
}