org.phenotips.vocabulary.internal.solr.OrphanetRareDiseaseOntology.java Source code

Java tutorial

Introduction

Here is the source code for org.phenotips.vocabulary.internal.solr.OrphanetRareDiseaseOntology.java

Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */
package org.phenotips.vocabulary.internal.solr;

import org.phenotips.vocabulary.VocabularyTerm;

import org.xwiki.component.annotation.Component;

import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.inject.Named;
import javax.inject.Singleton;

import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.SpellingParams;

import com.google.common.base.Optional;
import com.google.common.collect.ImmutableSet;
import com.hp.hpl.jena.ontology.IntersectionClass;
import com.hp.hpl.jena.ontology.OntClass;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.Restriction;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;

/**
 * Provides access to the Orphanet Rare Disease Ontology (ORDO). The ontology prefix is {@code ORDO}.
 *
 * @version $Id: a5061c88bc8d6a3fdec9ddd724f8fb805169d498 $
 * @since 1.3
 */
@Component
@Named("ordo")
@Singleton
public class OrphanetRareDiseaseOntology extends AbstractOWLSolrVocabulary {
    /** URI of the first of the two ontology classes that {@code getRootClasses} treats as hierarchy roots. */
    private static final String PHENOME_LABEL = "http://www.orpha.net/ORDO/Orphanet_C001";

    /** URI of the second of the two ontology classes that {@code getRootClasses} treats as hierarchy roots. */
    private static final String GENETIC_MATERIAL_LABEL = "http://www.orpha.net/ORDO/Orphanet_C010";

    /** Relation name that {@code writeProperty} routes to the external database reference extraction. */
    private static final String HASDBXREF_LABEL = "hasDbXref";

    /** Name of the Solr field that receives the identifiers of the named parent classes of a term. */
    private static final String TERM_CATEGORY_LABEL = "term_category";

    /** Name of the Solr field that receives the identifiers of the direct named parent classes of a term. */
    private static final String IS_A_LABEL = "is_a";

    /** Local name of the predicate that is looked up on a restriction to find the property it applies to. */
    private static final String ON_PROPERTY_LABEL = "onProperty";

    /**
     * The pattern for prevalence values. Values are expected to be in fraction format, and may include {@code <} or
     * {@code >} or ranges (e.g. "1-9"), or single digits in the numerator.
     * <p>
     * Capturing groups: (1) the first numerator value, (2) the optional second numerator value of a range, (3) the
     * slash separator with its surrounding whitespace, (4) the denominator, made of digits and whitespace.
     * </p>
     */
    private static final Pattern PREV_PATTERN = Pattern
            .compile("^>?<?\\s*([0-9]+)(?:[^0-9]+)?([0-9]+)?(\\s*/\\s*)([0-9\\s]+)");

    /** The hierarchy root classes; assigned by {@code getRootClasses} and read while extracting class data. */
    private Set<OntClass> hierarchyRoots;

    /**
     * The value of the most recently processed {@code present_in} restriction; set back to the empty string once an
     * intersection class has been processed.
     * NOTE(review): this and {@code isIntersection} are mutable state on a {@code @Singleton} component -- presumably
     * indexing is single-threaded; confirm.
     */
    private String region = StringUtils.EMPTY;

    /** Set to {@code true} while the operands of an intersection class are being processed. */
    private boolean isIntersection;

    /**
     * Provides the identifier of this vocabulary.
     *
     * @return the string {@code ordo}
     */
    @Override
    public String getIdentifier() {
        final String identifier = "ordo";
        return identifier;
    }

    /**
     * Provides the human-readable name of this vocabulary.
     *
     * @return the vocabulary name
     */
    @Override
    public String getName() {
        final String displayName = "Orphanet Rare Disease Ontology";
        return displayName;
    }

    /**
     * Provides the core name, which for this vocabulary is the same as its identifier.
     *
     * @return the value of {@link #getIdentifier()}
     */
    @Override
    protected String getCoreName() {
        final String coreName = this.getIdentifier();
        return coreName;
    }

    /**
     * Provides the names under which this vocabulary is known: its name, its identifier, its term prefix, and the
     * {@code ORPHA} alias.
     *
     * @return an unmodifiable set of aliases
     */
    @Override
    public Set<String> getAliases() {
        final Set<String> knownNames = new HashSet<>();
        Collections.addAll(knownNames, getName(), getIdentifier(), getTermPrefix(), "ORPHA");
        return Collections.unmodifiableSet(knownNames);
    }

    /**
     * Provides the batch size used for Solr documents.
     *
     * @return the number of documents per batch, {@code 15000}
     */
    @Override
    int getSolrDocsPerBatch() {
        final int docsPerBatch = 15000;
        return docsPerBatch;
    }

    /**
     * Provides the base URI of the ontology.
     *
     * @return the base ontology URI, as string
     */
    @Override
    String getBaseOntologyUri() {
        final String baseUri = "http://www.orpha.net/ontology/orphanet.owl";
        return baseUri;
    }

    /**
     * Provides the prefix used for term identifiers.
     *
     * @return the string {@code ORDO}
     */
    @Override
    String getTermPrefix() {
        final String prefix = "ORDO";
        return prefix;
    }

    /**
     * Provides the default location from which the ontology source is downloaded.
     *
     * @return the download URL, including the API key query parameter
     */
    @Override
    public String getDefaultSourceLocation() {
        // NOTE(review): the API key is hard-coded in source; consider whether it should come from configuration.
        final String downloadUrl = "http://data.bioontology.org/ontologies/ORDO/submissions/10/download";
        final String apiKeyParameter = "?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb";
        return downloadUrl + apiKeyParameter;
    }

    /**
     * Provides the address of the vocabulary's website.
     *
     * @return the website URL, as string
     */
    @Override
    public String getWebsite() {
        final String website = "http://www.orpha.net/";
        return website;
    }

    /**
     * Provides the citation text for this vocabulary.
     *
     * @return the citation, as string
     */
    @Override
    public String getCitation() {
        final String citation = "Orphadata: Free access data from Orphanet.  INSERM 1997";
        return citation;
    }

    /**
     * Records the two hierarchy root classes and returns their direct subclasses, which are the classes selected as
     * roots for this vocabulary.
     *
     * @param ontModel the ontology model to read the root classes from
     * @return the direct subclasses of both hierarchy roots
     */
    @Override
    Collection<OntClass> getRootClasses(@Nonnull final OntModel ontModel) {
        final OntClass phenomeRoot = ontModel.getOntClass(PHENOME_LABEL);
        final OntClass geneticMaterialRoot = ontModel.getOntClass(GENETIC_MATERIAL_LABEL);
        // Remember the roots: they are needed later, while extracting hierarchy data for each class.
        this.hierarchyRoots = ImmutableSet.of(phenomeRoot, geneticMaterialRoot);

        return ImmutableSet.<OntClass>builder()
                .addAll(phenomeRoot.listSubClasses(DIRECT))
                .addAll(geneticMaterialRoot.listSubClasses(DIRECT))
                .build();
    }

    /**
     * Dispatches extraction of the data that {@code parent} contributes to {@code ontClass}, depending on whether the
     * parent is a restriction, an intersection class, or a named class. Any other anonymous parent is only logged.
     *
     * @param doc the Solr input document for {@code ontClass}
     * @param ontClass the ontology class being indexed
     * @param parent the parent class to extract data from
     */
    @Override
    void extractClassData(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass ontClass,
            @Nonnull final OntClass parent) {
        if (parent.isRestriction()) {
            extractRestrictionData(doc, parent);
            return;
        }
        if (parent.isIntersectionClass()) {
            // In ORDO, an intersection class only holds one or several related restrictions.
            extractIntersectionData(doc, ontClass, parent);
            return;
        }
        if (parent.isAnon()) {
            this.logger.warn(
                    "Parent class {} of {} is an anonymous class that is neither restriction nor intersection",
                    parent.getId(), ontClass.getLocalName());
            return;
        }
        // Neither a restriction nor an intersection, and not anonymous: treat it as a named class.
        extractNamedClassData(doc, ontClass, parent);
    }

    /**
     * Records a named {@code parent} of {@code ontClass} in the {@link SolrInputDocument}. Parents that are hierarchy
     * roots, or direct subclasses of a hierarchy root, are skipped.
     *
     * @param doc the Solr input document
     * @param ontClass the ontology class
     * @param parent the parent of the ontology class
     */
    private void extractNamedClassData(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass ontClass,
            @Nonnull final OntClass parent) {
        // Note: in ORDO, a subclass cannot have parents from different top categories (e.g. phenome and geography).
        final boolean isTopLevel = this.hierarchyRoots.contains(parent) || hasHierarchyRootAsParent(parent, DIRECT);
        if (isTopLevel) {
            return;
        }

        // The local name is not null here: only anonymous classes lack one, and the caller (extractClassData)
        // has already filtered those out.
        final String parentId = getFormattedOntClassId(parent.getLocalName());

        // Every parent goes into "term_category".
        addMultivaluedField(doc, TERM_CATEGORY_LABEL, parentId);

        // Direct super-classes are additionally recorded under "is_a".
        final boolean isDirectParent = ontClass.hasSuperClass(parent, DIRECT);
        if (isDirectParent) {
            addMultivaluedField(doc, IS_A_LABEL, parentId);
        }
    }

    /**
     * Extracts data from the parent of ontClass that is an {@link IntersectionClass}, by processing each of its
     * operands. Updates the {@link SolrInputDocument} for ontClass.
     * <p>
     * While the operands are processed, this object is flagged as being inside an intersection, and the region set by
     * a {@code present_in} operand applies to the other operands. Both pieces of state are cleared before the
     * operands are processed and again afterwards, even if processing an operand fails, so that nothing leaks into
     * the data extracted for other classes.
     * </p>
     *
     * @param doc the Solr input document
     * @param ontClass the ontology class whose document is being updated
     * @param parent the parent class that contains the intersection class data for the ontologyClass
     */
    private void extractIntersectionData(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass ontClass,
            @Nonnull final OntClass parent) {
        // Obtain the operands before touching any state, so that a failure here leaves the state untouched.
        final ExtendedIterator<? extends OntClass> operands = parent.asIntersectionClass().listOperands();

        // Discard any region left over from a "present_in" restriction processed outside of an intersection.
        this.region = StringUtils.EMPTY;
        this.isIntersection = true;
        try {
            while (operands.hasNext()) {
                // For ORDO, there should only be restrictions in intersection classes.
                extractClassData(doc, ontClass, operands.next());
            }
        } finally {
            this.region = StringUtils.EMPTY;
            this.isIntersection = false;
            operands.close();
        }
    }

    /**
     * Extracts data from the parent of ontClass that is a {@link Restriction} and updates the
     * {@link SolrInputDocument} for ontClass. Only someValuesFrom and hasValue restrictions are handled; any other
     * kind of restriction is logged and ignored.
     *
     * @param doc the Solr input document
     * @param parent the parent class that contains restriction data for the ontologyClass
     */
    private void extractRestrictionData(@Nonnull final SolrInputDocument doc, @Nonnull final OntClass parent) {
        final Restriction parentRestriction = parent.asRestriction();

        // Several restriction types exist (someValuesFrom, hasValue, allValuesFrom, ...); ORDO appears to only use
        // the first two.
        if (parentRestriction.isSomeValuesFromRestriction()) {
            extractSomeValuesFromRestriction(doc, parentRestriction);
            return;
        }
        if (parentRestriction.isHasValueRestriction()) {
            extractHasValueRestriction(doc, parentRestriction);
            return;
        }
        this.logger.warn("Restriction {} in class {} is neither someValuesFrom nor hasValue type.",
                parentRestriction.getId(), doc.getFieldValue(ID_FIELD_NAME));
    }

    /**
     * Extracts data from a {@link Restriction} of type {@link com.hp.hpl.jena.ontology.SomeValuesFromRestriction} and
     * updates the {@link SolrInputDocument} accordingly. A {@code present_in} restriction additionally records its
     * value as the current region.
     *
     * @param doc the input Solr document
     * @param restriction the restriction
     */
    private void extractSomeValuesFromRestriction(@Nonnull final SolrInputDocument doc,
            @Nonnull final Restriction restriction) {
        // someValuesFrom restrictions refer to the other "modifier" classes such as inheritance, geography, etc.
        // If a disease is part of a group of disorders, it will also be indicated here under a "part_of" property.
        final String property = getOnPropertyFromRestriction(restriction);
        final String value = getSomeValuesFromRestriction(restriction);

        if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
            this.logger.warn(
                    "Could not extract data from someValuesFrom restriction {}, onProperty {}, in class {}",
                    restriction.getId(), property, doc.getFieldValue(ID_FIELD_NAME));
            return;
        }

        if ("present_in".equals(property)) {
            // Remember the region, so that it can be attached to the other data of the same intersection.
            this.region = value;
            addMultivaluedField(doc, property, value);
            return;
        }

        if (this.isIntersection) {
            writeWorldwideDataFromRestriction(doc, property, value);
        } else {
            addMultivaluedField(doc, property, value);
        }
    }

    /**
     * Extracts data from the parent of ontClass that is a {@link Restriction} of type
     * {@link com.hp.hpl.jena.ontology.HasValueRestriction}. Updates the {@link SolrInputDocument} for ontClass.
     * <p>
     * Only literal values are extracted. A restriction whose value is missing or not a literal is logged and
     * skipped, the same way as a restriction with a blank property name or a blank value.
     * </p>
     *
     * @param doc the input Solr document
     * @param restriction the restriction
     */
    private void extractHasValueRestriction(@Nonnull final SolrInputDocument doc,
            @Nonnull final Restriction restriction) {
        // Not all of these have pretty names. Re-map these via managed-schema.xml field configurations.
        final String fieldName = getOnPropertyFromRestriction(restriction);
        // Calling asLiteral() on a non-literal node throws, so check first, as the other extraction methods do.
        final RDFNode value = restriction.asHasValueRestriction().getHasValue();
        final String fieldValue = value != null && value.isLiteral() ? value.asLiteral().getLexicalForm() : null;

        if (StringUtils.isNotBlank(fieldName) && StringUtils.isNotBlank(fieldValue)) {
            if (!this.isIntersection) {
                addMultivaluedField(doc, fieldName, fieldValue);
            } else {
                writeWorldwideDataFromRestriction(doc, fieldName, fieldValue);
            }
        } else {
            this.logger.warn("Could not extract data from hasValue restriction {}, onProperty {}, in class {}",
                    restriction.getId(), fieldName, doc.getFieldValue(ID_FIELD_NAME));
        }
    }

    /**
     * A workaround to obtain the label for the onProperty field for a {@link Restriction}. Ideally, this should be done
     * by using {@link Restriction#getOnProperty()}, however for ORDO, the stored node cannot be converted into an
     * OntProperty class.
     * <p>
     * The statement iterator is always closed, including when the property is found before the iterator is
     * exhausted.
     * </p>
     *
     * @param restriction the restriction being examined
     * @return the onProperty label as a string, or {@code null} if the restriction has no onProperty statement
     */
    private String getOnPropertyFromRestriction(@Nonnull final Restriction restriction) {
        final ExtendedIterator<Statement> statements = restriction.listProperties();
        try {
            while (statements.hasNext()) {
                final Statement statement = statements.next();
                // Workaround for getting the property label.
                if (ON_PROPERTY_LABEL.equals(statement.getPredicate().getLocalName())) {
                    final String onPropertyLink = statement.getObject().toString();
                    return restriction.getOntModel().getOntResource(onPropertyLink).getLabel(null);
                }
            }
            return null;
        } finally {
            // An iterator that is abandoned before it is exhausted must be closed explicitly.
            statements.close();
        }
    }

    /**
     * Obtains the value of a {@link Restriction} of type {@link com.hp.hpl.jena.ontology.SomeValuesFromRestriction}.
     * If the referenced class has one of the hierarchy roots among its super-classes, its formatted identifier is
     * returned; otherwise, its label is returned.
     *
     * @param restriction the restriction being examined
     * @return the someValuesFrom restriction value as a string
     */
    private String getSomeValuesFromRestriction(@Nonnull final Restriction restriction) {
        final OntClass referenced = restriction.asSomeValuesFromRestriction().getSomeValuesFrom().as(OntClass.class);
        if (hasHierarchyRootAsParent(referenced, !DIRECT)) {
            return getFormattedOntClassId(referenced.getLocalName());
        }
        return referenced.getLabel(null);
    }

    /**
     * Checks whether an {@link OntClass} has one of the hierarchy roots as a super-class.
     *
     * @param ontClass the class to check
     * @param level the value passed as the "direct" flag to {@link OntClass#hasSuperClass}: only direct super-classes
     *            are considered iff true, the entire tree is traversed otherwise
     * @return true iff at least one of the hierarchy roots is a super-class of {@code ontClass}
     */
    private Boolean hasHierarchyRootAsParent(@Nonnull final OntClass ontClass, @Nonnull final Boolean level) {
        boolean rootFound = false;
        for (final OntClass root : this.hierarchyRoots) {
            if (ontClass.hasSuperClass(root, level)) {
                rootFound = true;
                break;
            }
        }
        return rootFound;
    }

    /**
     * Extracts property hasDbXref from an {@link RDFNode} and adds it to the {@link SolrInputDocument}. The reference
     * is split on the separator: the part before it, lower-cased and suffixed with {@code _id}, becomes the field
     * name, and the part after it becomes the field value.
     * <p>
     * Non-literal nodes are ignored. References that do not contain both parts are logged and skipped, since they
     * would otherwise produce a field with an empty value.
     * </p>
     *
     * @param doc the Solr input document for an {@link OntClass} of interest
     * @param object the {@link RDFNode} object contained within the {@link OntClass} of interest
     */
    private void extractDbxRef(@Nonnull final SolrInputDocument doc, @Nonnull final RDFNode object) {
        // If the node is not a literal, will throw a {@link LiteralRequiredException}. For ORDO, this is always a
        // literal.
        if (!object.isLiteral()) {
            return;
        }
        final String externalRef = object.asLiteral().getLexicalForm();
        final String ontology = StringUtils.substringBefore(externalRef, SEPARATOR);
        final String externalId = StringUtils.substringAfter(externalRef, SEPARATOR);
        if (StringUtils.isBlank(ontology) || StringUtils.isBlank(externalId)) {
            this.logger.warn("Ignoring malformed external reference [{}] in class {}", externalRef,
                    doc.getFieldValue(ID_FIELD_NAME));
            return;
        }
        // Field names must not depend on the default locale of the JVM (e.g. the Turkish dotless "i").
        addMultivaluedField(doc, ontology.toLowerCase(Locale.ROOT) + "_id", externalId);
    }

    /**
     * Stores the lexical form of the provided {@link RDFNode} under {@code fieldName} in the
     * {@link SolrInputDocument}, provided that the node is a literal. Other nodes are ignored.
     *
     * @param doc the Solr input document
     * @param fieldName the name of the field to be stored
     * @param object the {@link RDFNode} object
     */
    private void extractField(@Nonnull final SolrInputDocument doc, @Nonnull final String fieldName,
            @Nonnull final RDFNode object) {
        // Converting a non-literal node would throw a {@link LiteralRequiredException}. Non-literals are properties
        // like Class or subClassOf, and that kind of data is already added via parents.
        if (!object.isLiteral()) {
            return;
        }
        addMultivaluedField(doc, fieldName, object.asLiteral().getLexicalForm());
    }

    /**
     * Adds {@code fieldValue} to the {@code fieldName} field of the {@link SolrInputDocument}, unless the field
     * already holds that value.
     *
     * @param doc the Solr input document
     * @param fieldName the name of the field to be added
     * @param fieldValue the value of the field being added
     */
    private void addMultivaluedField(@Nonnull final SolrInputDocument doc, @Nonnull final String fieldName,
            @Nonnull final Object fieldValue) {
        final Collection<Object> storedValues = doc.getFieldValues(fieldName);
        // No stored values means that the field does not exist yet.
        final boolean alreadyStored = storedValues != null && storedValues.contains(fieldValue);
        if (!alreadyStored) {
            doc.addField(fieldName, fieldValue);
        }
    }

    /**
     * Adds {@code fieldValue} to the {@code fieldName} field of the {@link SolrInputDocument}, unless it is equal to
     * the value returned by {@link SolrInputDocument#getFieldValue} for that field.
     *
     * @param doc the Solr input document
     * @param fieldName the name of the field to be added
     * @param fieldValue the value of the field being added
     */
    private void addSingleValuedField(@Nonnull final SolrInputDocument doc, @Nonnull final String fieldName,
            @Nonnull final Object fieldValue) {
        final Object storedValue = doc.getFieldValue(fieldName);
        if (fieldValue.equals(storedValue)) {
            return;
        }
        // NOTE(review): addField presumably appends rather than replaces, so a different value would leave two
        // values in a field meant to be single-valued -- confirm this is intended.
        doc.addField(fieldName, fieldValue);
    }

    /**
     * Writes {@code fieldValue} to the {@code fieldName} field of {@code doc}, suffixed with the current region in
     * parentheses if a region is set. In addition, if the current region is "Worldwide" and the field is one of the
     * prevalence range fields, the numeric prevalence is written to the {@code fieldName + "_numeric"} field.
     *
     * @param doc the {@link SolrInputDocument} being modified
     * @param fieldName the name of the field to be added
     * @param fieldValue the value of the field to be added
     */
    private void writeWorldwideDataFromRestriction(@Nonnull final SolrInputDocument doc,
            @Nonnull final String fieldName, @Nonnull final String fieldValue) {
        final boolean isPrevalenceRange = "has_point_prevalence_range".equals(fieldName)
                || "has_birth_prevalence_range".equals(fieldName)
                || "has_lifetime_prevalence_range".equals(fieldName);
        if ("Worldwide".equals(this.region) && isPrevalenceRange) {
            addSingleValuedField(doc, fieldName + "_numeric", getNumericPrevalenceValue(fieldValue));
        }

        // The value is also stored as is, with the region data appended when there is any.
        String valueWithRegion = fieldValue;
        if (StringUtils.isNotBlank(this.region)) {
            valueWithRegion = fieldValue + " (" + this.region + ")";
        }
        addMultivaluedField(doc, fieldName, valueWithRegion);
    }

    /**
     * Calculates numeric prevalence data based on a {@code fieldValue prevalence value} string. The value is the
     * numerator (the average of the two bounds, if it is a range) divided by the denominator.
     * <p>
     * Each failure is logged once, with a message that names its cause: the value does not match the expected
     * pattern, one of its numbers cannot be parsed, or its denominator is zero.
     * </p>
     *
     * @param fieldValue the string containing the prevalence data range
     * @return the calculated prevalence, as double, -1 if an error occurred
     */
    private double getNumericPrevalenceValue(@Nonnull final String fieldValue) {
        final Matcher matcher = PREV_PATTERN.matcher(fieldValue);
        if (!matcher.find()) {
            this.logger.error("The provided prevalence value: [{}] did not match the expected pattern.",
                    fieldValue);
            return -1;
        }
        try {
            final double numerator = getNumerator(matcher.group(1), matcher.group(2));
            final double denominator = getDenominator(matcher.group(4));
            // Dividing by zero would silently produce Infinity or NaN rather than the documented error value.
            if (denominator > 0) {
                return numerator / denominator;
            }
            this.logger.error("The provided prevalence value: [{}] has a zero denominator.", fieldValue);
        } catch (final NumberFormatException ex) {
            this.logger.error("Could not parse the provided prevalence value: [{}]: [{}]", fieldValue,
                    ex.getMessage());
        }
        return -1;
    }

    /**
     * Parses the denominator from {@code rawDenominatorStr}, ignoring any whitespace it contains (e.g. the
     * separators in "100 000").
     *
     * @param rawDenominatorStr the denominator, as string
     * @return the denominator, as double
     */
    private double getDenominator(@Nonnull final String rawDenominatorStr) {
        final String digitsOnly = rawDenominatorStr.replaceAll("\\s+", StringUtils.EMPTY);
        return Double.parseDouble(digitsOnly);
    }

    /**
     * Calculates the numerator from {@code firstVal} and {@code secondVal}, expressed as strings: the first value
     * alone if there is no second value, and the average of the two values otherwise.
     *
     * @param firstVal the first value in the numerator range
     * @param secondVal the second value in the numerator range; can be null if the numerator is not a range
     * @return the calculated numerator, as double
     */
    private double getNumerator(@Nonnull final String firstVal, @Nullable final String secondVal) {
        final double lowerBound = Double.parseDouble(firstVal);
        if (StringUtils.isNotBlank(secondVal)) {
            final double upperBound = Double.parseDouble(secondVal);
            return (lowerBound + upperBound) / 2;
        }
        return lowerBound;
    }

    /**
     * Writes the value of a property of an ontology class to the {@link SolrInputDocument}. External database
     * references are handled separately from all other properties.
     *
     * @param doc the Solr input document
     * @param relation the name of the property
     * @param object the {@link RDFNode} holding the property value
     */
    @Override
    void writeProperty(@Nonnull final SolrInputDocument doc, @Nonnull final String relation,
            @Nonnull final RDFNode object) {
        final boolean isExternalReference = HASDBXREF_LABEL.equals(relation);
        if (!isExternalReference) {
            extractField(doc, relation, object);
            return;
        }
        // hasDBXRef stores references to other databases (e.g. OMIM).
        extractDbxRef(doc, object);
    }

    /**
     * Converts the local name of an ontology class into a term identifier, by replacing {@code Orphanet_} with
     * {@code ORDO:}.
     *
     * @param localName the local name of the ontology class; may be null
     * @return the formatted identifier, or {@code null} if {@code localName} is blank
     */
    @Override
    String getFormattedOntClassId(@Nullable final String localName) {
        if (StringUtils.isBlank(localName)) {
            return null;
        }
        return localName.replace("Orphanet_", "ORDO:");
    }

    /**
     * Searches for terms matching the input string. A blank input yields no results, without querying the index.
     *
     * @param input the string to match
     * @param maxResults the maximum number of results
     * @param sort the optional sort parameter
     * @param customFilter the optional custom filter for the results
     * @return a list of matching {@link VocabularyTerm} objects; empty if the input is blank
     */
    @Override
    public List<VocabularyTerm> search(@Nullable final String input, final int maxResults,
            @Nullable final String sort, @Nullable final String customFilter) {
        if (StringUtils.isBlank(input)) {
            return Collections.<VocabularyTerm>emptyList();
        }
        return searchMatches(input, maxResults, sort, customFilter);
    }

    /**
     * Builds a Solr query for the input string, runs it, and wraps each resulting document into a
     * {@link VocabularyTerm}.
     *
     * @param input string to match
     * @param maxResults the maximum number of results
     * @param sort the optional sort parameter
     * @param customFilter custom filter for results
     * @return an unmodifiable list of matching {@link VocabularyTerm} objects; empty if no suitable matches found
     */
    private List<VocabularyTerm> searchMatches(@Nonnull final String input, final int maxResults,
            @Nullable final String sort, @Nullable final String customFilter) {
        // Static parameters first (global, then field boosts), then the ones that depend on the request.
        final SolrQuery solrQuery = addFieldQueryParam(addGlobalQueryParam(new SolrQuery()));
        final SolrQuery completeQuery = addDynamicQueryParam(input, maxResults, sort, customFilter, solrQuery);

        final List<VocabularyTerm> terms = new LinkedList<>();
        for (final SolrDocument match : search(completeQuery)) {
            terms.add(new SolrVocabularyTerm(match, this));
        }
        return Collections.unmodifiableList(terms);
    }

    /**
     * Adds dynamic solr query parameters to {@code query}, based on the received {@code rawQuery raw query string},
     * {@code rows the maximum number of results to return}, {@code sort the sorting order}, and {@code customFilter a
     * custom filter}.
     * <p>
     * The sort parameter is a comma-separated list of items. Each item is a field name, which sorts in descending
     * order if it is followed by {@code " desc"} or prefixed with {@code -}, and in ascending order otherwise. The
     * {@code -} prefix is not part of the field name. Whitespace around items is ignored, and so are empty items.
     * </p>
     *
     * @param rawQuery unprocessed query string
     * @param rows the maximum number of search items to return
     * @param sort the optional sort parameter
     * @param customFilter custom filter for the results
     * @param query a {@link SolrQuery solr query} object
     * @return the updated {@link SolrQuery solr query} object
     */
    private SolrQuery addDynamicQueryParam(@Nonnull final String rawQuery, final Integer rows,
            @Nullable final String sort, @Nullable final String customFilter, @Nonnull SolrQuery query) {
        final String queryString = rawQuery.trim();
        final String escapedQuery = ClientUtils.escapeQueryChars(queryString);
        if (StringUtils.isNotBlank(customFilter)) {
            query.setFilterQueries(customFilter);
        }
        query.setQuery(escapedQuery);
        query.set(SpellingParams.SPELLCHECK_Q, queryString);
        // Boost the documents matching the last word of the query as a prefix.
        final String lastWord = StringUtils.defaultIfBlank(StringUtils.substringAfterLast(escapedQuery, " "),
                escapedQuery) + "*";
        query.set(DisMaxParams.BQ,
                String.format("nameSpell:%1$s^20 defSpell:%1$s^3 text:%1$s^1 textSpell:%1$s^2", lastWord));
        query.setRows(rows);
        if (StringUtils.isNotBlank(sort)) {
            for (final String rawSortItem : sort.split(",")) {
                final String sortItem = rawSortItem.trim();
                final boolean descending = sortItem.endsWith(" desc") || sortItem.startsWith("-");
                // The "-" prefix only marks the order; it must not end up in the name of the field.
                final String sortField = StringUtils.removeStart(StringUtils.substringBefore(sortItem, " "), "-");
                if (StringUtils.isNotBlank(sortField)) {
                    query.addSort(sortField, descending ? ORDER.desc : ORDER.asc);
                }
            }
        }
        return query;
    }

    /**
     * Given a {@code query} object, adds the query parameters that do not depend on the search input: the spellcheck
     * settings, the operator case handling, and the query parser type.
     *
     * @param query a {@link SolrQuery solr query} object
     * @return the {@code query} with global query parameters added
     */
    private SolrQuery addGlobalQueryParam(@Nonnull final SolrQuery query) {
        final String enabled = Boolean.toString(true);
        final String disabled = Boolean.toString(false);

        // Spellcheck settings.
        query.set("spellcheck", enabled);
        query.set(SpellingParams.SPELLCHECK_COLLATE, enabled);
        query.set(SpellingParams.SPELLCHECK_COUNT, "100");
        query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "3");
        // Query parser settings.
        query.set("lowercaseOperators", disabled);
        query.set("defType", "edismax");
        return query;
    }

    /**
     * Given a {@code query} object, adds the boosted field lists used as the {@link DisMaxParams#PF} and
     * {@link DisMaxParams#QF} parameters.
     *
     * @param query a {@link SolrQuery solr query} object
     * @return the {@code query} with field parameters added
     */
    private SolrQuery addFieldQueryParam(@Nonnull final SolrQuery query) {
        final String phraseFields = "name^20 nameSpell^36 nameExact^100 namePrefix^30 "
                + "synonym^15 synonymSpell^25 synonymExact^70 synonymPrefix^20 "
                + "def^7 defSpell^14 text^3 textSpell^5";
        final String queryFields = "id^100 name^10 nameSpell^18 nameStub^5 "
                + "synonym^6 synonymSpell^10 synonymStub^4 "
                + "def^3 defSpell^5 text^1 textSpell^2 textStub^0.5";

        query.set(DisMaxParams.PF, phraseFields);
        query.set(DisMaxParams.QF, queryFields);
        return query;
    }
}