org.phenotips.vocabulary.internal.solr.MendelianInheritanceInMan.java Source code

Introduction

Here is the source code for org.phenotips.vocabulary.internal.solr.MendelianInheritanceInMan.java
Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */
package org.phenotips.vocabulary.internal.solr;

import org.phenotips.vocabulary.Vocabulary;
import org.phenotips.vocabulary.VocabularyTerm;

import org.xwiki.component.annotation.Component;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.SpellingParams;
import org.joda.time.DateTime;
import org.joda.time.format.ISODateTimeFormat;

/**
 * Provides access to the Online Mendelian Inheritance in Man (OMIM) vocabulary. The vocabulary prefix is {@code MIM}.
 *
 * @version $Id: 05f71e368d34d6a07c4ba79086a4157bc1f9b619 $
 * @since 1.0M8
 */
@Component
@Named("omim")
@Singleton
public class MendelianInheritanceInMan extends AbstractCSVSolrVocabulary {
    /** The standard name of this vocabulary, used as a term prefix. */
    public static final String STANDARD_NAME = "MIM";

    /** The location for the official OMIM source. */
    public static final String OMIM_SOURCE_URL = "http://data.omim.org/downloads/???/mimTitles.txt";

    private static final String GENE_ANNOTATIONS_URL = "http://omim.org/static/omim/data/mim2gene.txt";

    private static final String GENEREVIEWS_MAPPING_URL = "ftp://ftp.ncbi.nih.gov/pub/GeneReviews/NBKid_shortname_OMIM.txt";

    private static final String ENCODING = "UTF-8";

    private static final String ID_FIELD = "id";

    private static final String SYMBOL_FIELD = "symbol";

    private static final String TYPE_FIELD = "type";

    private static final String NAME_FIELD = "name";

    private static final String SHORT_NAME_FIELD = "short_name";

    private static final String SYNONYM_FIELD = "synonym";

    private static final String INCLUDED_NAME_FIELD = "included_name";

    private static final String GENE_FIELD = "GENE";

    private static final String TITLE_SEPARATOR = ";";

    private static final String LIST_SEPARATOR = ";;";

    /** The map of symbols preceding a MIM number to their symbolic representations. */
    private static final Map<String, String> SYMBOLS;

    /** The map of symbols preceding a MIM number to their corresponding types. */
    private static final Map<String, String[]> TYPES;

    private Map<String, SolrInputDocument> data = new HashMap<>();

    static {
        Map<String, String> symbols = new HashMap<>();
        Map<String, String[]> types = new HashMap<>();
        symbols.put("NULL", "");
        types.put("NULL", new String[] { "disorder" });

        symbols.put("Asterisk", "*");
        types.put("Asterisk", new String[] { "gene" });

        symbols.put("Number Sign", "#");
        types.put("Number Sign", new String[] { "disorder" });

        symbols.put("Plus", "+");
        types.put("Plus", new String[] { "gene", "disorder" });

        symbols.put("Percent", "%");
        types.put("Percent", new String[] { "disorder" });

        SYMBOLS = Collections.unmodifiableMap(symbols);
        TYPES = Collections.unmodifiableMap(types);
    }

    @Inject
    @Named("hpo")
    private Vocabulary hpo;

    @Override
    public String getDefaultSourceLocation() {
        return OMIM_SOURCE_URL;
    }

    @Override
    protected int getSolrDocsPerBatch() {
        return 500000;
    }

    @Override
    protected String getCoreName() {
        return getIdentifier();
    }

    @Override
    public String getIdentifier() {
        return "omim";
    }

    @Override
    public String getName() {
        return "Online Mendelian Inheritance in Man (OMIM)";
    }

    @Override
    public Set<String> getAliases() {
        Set<String> result = new HashSet<>();
        result.add(getIdentifier());
        result.add(STANDARD_NAME);
        result.add("OMIM");
        return result;
    }

    @Override
    public String getWebsite() {
        return "http://www.omim.org/";
    }

    @Override
    public String getCitation() {
        return "Online Mendelian Inheritance in Man, OMIM\u00ae. McKusick-Nathans Institute of Genetic Medicine,"
                + " Johns Hopkins University (Baltimore, MD)";
    }

    @Override
    protected Collection<SolrInputDocument> load(URL url) {
        parseOmimData(url);
        loadGenes();
        loadGeneReviews();
        loadVersion();
        return this.data.values();
    }

    @Override
    public VocabularyTerm getTerm(String id) {
        VocabularyTerm result = super.getTerm(id);
        if (result == null) {
            String optionalPrefix = STANDARD_NAME + ":";
            if (StringUtils.startsWith(id, optionalPrefix)) {
                result = getTerm(StringUtils.substringAfter(id, optionalPrefix));
            }
        }
        return result;
    }

    @Override
    public List<VocabularyTerm> search(String input, int maxResults, String sort, String customFilter) {
        if (StringUtils.isBlank(input)) {
            return Collections.emptyList();
        }
        SolrQuery query = new SolrQuery();
        addFieldQueryParameters(addGlobalQueryParameters(query));
        List<VocabularyTerm> result = new LinkedList<>();
        for (SolrDocument doc : search(addDynamicQueryParameters(input, maxResults, sort, customFilter, query))) {
            result.add(new SolrVocabularyTerm(doc, this));
        }
        return result;
    }

    private SolrQuery addGlobalQueryParameters(SolrQuery query) {
        query.set("spellcheck", Boolean.toString(true));
        query.set(SpellingParams.SPELLCHECK_COLLATE, Boolean.toString(true));
        query.set(SpellingParams.SPELLCHECK_COUNT, "100");
        query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "3");
        query.set("lowercaseOperators", Boolean.toString(false));
        query.set("defType", "edismax");
        return query;
    }

    private SolrQuery addFieldQueryParameters(SolrQuery query) {
        query.set(DisMaxParams.PF, "name^40 nameSpell^70 synonym^15 synonymSpell^25 "
                + "included_name^15 included_nameSpell^25 text^3 textSpell^5");
        query.set(DisMaxParams.QF, "id^40 name^10 nameSpell^18 nameStub^5 " + "short_name^18 short_nameStub^5 "
                + "synonym^6 synonymSpell^10 synonymStub^3 "
                + "included_name^6 included_nameSpell^10 included_nameStub^3 " + "text^1 textSpell^2 textStub^0.5");
        return query;
    }

    private SolrQuery addDynamicQueryParameters(String originalQuery, Integer rows, String sort, String customFq,
            SolrQuery query) {
        String queryString = originalQuery.trim();
        String escapedQuery = ClientUtils.escapeQueryChars(queryString);
        query.setFilterQueries(StringUtils.defaultIfBlank(customFq, "+type:disorder"));
        query.setQuery(escapedQuery);
        query.set(SpellingParams.SPELLCHECK_Q, queryString);
        String lastWord = StringUtils.substringAfterLast(escapedQuery, " ");
        if (StringUtils.isBlank(lastWord)) {
            lastWord = escapedQuery;
        }
        lastWord += "*";
        query.set(DisMaxParams.BQ,
                String.format(
                        "nameSpell:%1$s^20 short_name:%1$s^20 synonymSpell:%1$s^12 text:%1$s^1 textSpell:%1$s^2",
                        lastWord));
        query.setRows(rows);
        if (StringUtils.isNotBlank(sort)) {
            for (String sortItem : sort.split("\\s*,\\s*")) {
                query.addSort(StringUtils.substringBefore(sortItem, " "),
                        sortItem.endsWith(" desc") || sortItem.startsWith("-") ? ORDER.desc : ORDER.asc);
            }
        }
        return query;
    }

    private void parseOmimData(URL sourceUrl) {
        try {
            Reader in = new InputStreamReader(sourceUrl.openConnection().getInputStream(),
                    Charset.forName(ENCODING));
            for (CSVRecord row : CSVFormat.TDF.withCommentMarker('#').parse(in)) {
                // Ignore moved or removed entries
                if ("Caret".equals(row.get(0))) {
                    continue;
                }

                SolrInputDocument crtTerm = new SolrInputDocument();
                // set id
                addFieldValue(ID_FIELD, row.get(1), crtTerm);

                // set symbol
                addFieldValue(SYMBOL_FIELD, SYMBOLS.get(row.get(0)), crtTerm);
                // set type (multivalued)
                for (String type : TYPES.get(row.get(0))) {
                    addFieldValue(TYPE_FIELD, type, crtTerm);
                }
                // set name
                String name = StringUtils.substringBefore(row.get(2), TITLE_SEPARATOR).trim();
                addFieldValue(NAME_FIELD, name, crtTerm);
                // set short name
                String shortNameString = StringUtils.substringAfter(row.get(2), TITLE_SEPARATOR).trim();
                String[] shortNames = StringUtils.split(shortNameString, TITLE_SEPARATOR);
                for (String shortName : shortNames) {
                    addFieldValue(SHORT_NAME_FIELD, shortName.trim(), crtTerm);
                }

                // set synonyms
                setListFieldValue(SYNONYM_FIELD, row.get(3), crtTerm);
                // set included name
                setListFieldValue(INCLUDED_NAME_FIELD, row.get(4), crtTerm);

                this.data.put(String.valueOf(crtTerm.get(ID_FIELD).getFirstValue()), crtTerm);
            }
        } catch (IOException ex) {
            this.logger.warn("Failed to read/parse the OMIM source: {}", ex.getMessage());
        }
    }

    private void setListFieldValue(String targetField, String value, SolrInputDocument doc) {
        if (StringUtils.isNotBlank(value)) {
            String[] items = StringUtils.split(value, LIST_SEPARATOR);
            for (String item : items) {
                addFieldValue(targetField, item.replaceAll(", INCLUDED$", "").trim(), doc);
            }
        }
    }

    private void addFieldValue(String targetField, String value, SolrInputDocument doc) {
        if (StringUtils.isNotBlank(value)) {
            doc.addField(targetField, value);
        }
    }

    private void loadGenes() {
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(GENE_ANNOTATIONS_URL).openConnection().getInputStream(), ENCODING))) {
            for (CSVRecord row : CSVFormat.TDF.withCommentMarker('#').parse(in)) {
                SolrInputDocument term = this.data.get(row.get(0).trim());
                if (term != null) {
                    String gs = row.get(3).trim();
                    if (StringUtils.isNotBlank(gs)) {
                        term.addField(GENE_FIELD, gs);
                    }
                    String eidLine = row.get(4).trim();
                    if (StringUtils.isNotBlank(eidLine)) {
                        String[] eids = StringUtils.split(eidLine, ",");
                        for (String eid : eids) {
                            term.addField(GENE_FIELD, eid.trim());
                        }
                    }
                }
            }
        } catch (IOException ex) {
            this.logger.error("Failed to load OMIM-Gene links: {}", ex.getMessage(), ex);
        }
    }

    private void loadGeneReviews() {
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                new URL(GENEREVIEWS_MAPPING_URL).openConnection().getInputStream(), ENCODING))) {
            for (CSVRecord row : CSVFormat.TDF.withHeader().parse(in)) {
                SolrInputDocument term = this.data.get(row.get(2));
                if (term != null) {
                    term.setField("gene_reviews_link", "https://www.ncbi.nlm.nih.gov/books/" + row.get(0));
                }
            }
        } catch (IOException ex) {
            this.logger.error("Failed to load OMIM-GeneReviews links: {}", ex.getMessage(), ex);
        }
    }

    private void loadVersion() {
        SolrInputDocument metaTerm = new SolrInputDocument();
        metaTerm.addField(ID_FIELD, "HEADER_INFO");
        metaTerm.addField("version", ISODateTimeFormat.dateTime().withZoneUTC().print(new DateTime()));
        this.data.put("VERSION", metaTerm);
    }
}