org.phenotips.vocabulary.internal.solr.OmimSourceParser.java Source code

Introduction

Here is the source code for org.phenotips.vocabulary.internal.solr.OmimSourceParser.java
Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */
package org.phenotips.vocabulary.internal.solr;

import org.phenotips.vocabulary.Vocabulary;
import org.phenotips.vocabulary.VocabularyTerm;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.joda.time.DateTime;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class for parsing the special OMIM source, while also gathering annotations from other sources: OMIM-Gene symbols
 * mapping from OMIM, OMIM-Phenotype mapping from HPO, OMIM-GeneReviews mapping from NCBI.
 * <p>
 * All the work is done by the constructor: it downloads and parses the OMIM source, then enriches the parsed terms
 * with the extra annotations. The resulting documents are available through {@link #getData()}. Failures are logged
 * instead of being reported to the caller, in which case the returned data may be empty or incomplete.
 * </p>
 *
 * @version $Id: 42e9d61c40e64044c6192f897002e20418949b74 $
 * @since 1.3M1
 */
public class OmimSourceParser {
    /** The location for the official OMIM source. */
    public static final String OMIM_SOURCE_URL = "ftp://ftp.omim.org/OMIM/omim.txt.Z";

    /** Line marking the start of a new record in the OMIM source. */
    private static final String RECORD_MARKER = "*RECORD*";

    /** Prefix of the lines introducing a new field of the current record; the field name follows the prefix. */
    private static final String FIELD_MARKER = "*FIELD* ";

    /** Name of the OMIM field whose value is stored as the term identifier. */
    private static final String FIELD_MIM_NUMBER = "NO";

    /** Name of the OMIM field holding the title and the synonyms. */
    private static final String FIELD_TITLE = "TI";

    /** Name of the OMIM field whose value is stored as the term definition. */
    private static final String FIELD_TEXT = "TX";

    /** Line marking the end of the last record in the OMIM source. */
    private static final String END_MARKER = "*THEEND*";

    /** Separator between the title and the synonyms inside the title field. */
    private static final String TITLE_SEPARATOR = ";;";

    private static final String ANNOTATIONS_BASE_URL = "http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/";

    private static final String GENE_ANNOTATIONS_URL = "http://omim.org/static/omim/data/mim2gene.txt";

    private static final String POSITIVE_ANNOTATIONS_URL = ANNOTATIONS_BASE_URL + "phenotype_annotation.tab";

    private static final String NEGATIVE_ANNOTATIONS_URL = ANNOTATIONS_BASE_URL
            + "negative_phenotype_annotation.tab";

    private static final String GENEREVIEWS_MAPPING_URL = "ftp://ftp.ncbi.nih.gov/pub/GeneReviews/NBKid_shortname_OMIM.txt";

    /** Character encoding used for reading all the sources. */
    private static final String ENCODING = "UTF-8";

    private static final String ID_FIELD = "id";

    private static final String NAME_FIELD = "name";

    private static final String SYNONYM_FIELD = "synonym";

    private static final String GENE_FIELD = "GENE";

    /** The record currently being assembled by {@link #transform(BufferedReader)}. */
    private SolrInputDocument crtTerm;

    /** The parsed documents, indexed by their identifier. */
    private final Map<String, SolrInputDocument> data = new HashMap<>();

    private final Logger logger = LoggerFactory.getLogger(OmimSourceParser.class);

    /** The HPO vocabulary, used for expanding annotated phenotypes into their ancestors. */
    private final Vocabulary hpo;

    /**
     * Constructor which prepares the vocabulary data, parsing OMIM from the official site.
     *
     * @param hpo the HPO vocabulary, needed for computing the ancestors for the MIM-Phenotype mapping
     */
    public OmimSourceParser(Vocabulary hpo) {
        this(hpo, OMIM_SOURCE_URL);
    }

    /**
     * Constructor which prepares the vocabulary data, parsing OMIM from the specified source file. The annotation
     * sources are only loaded if the OMIM source itself was successfully parsed. Failures are logged, not thrown.
     *
     * @param hpo the HPO vocabulary, needed for computing the ancestors for the MIM-Phenotype mapping
     * @param sourceURL the location from which to fetch the OMIM source, as a zipped file; may be a ftp, http, or local
     *            file URL
     */
    public OmimSourceParser(Vocabulary hpo, String sourceURL) {
        this.hpo = hpo;
        try {
            parseOmim(sourceURL);
            loadGenes();
            loadSymptoms(true);
            loadSymptoms(false);
            loadGeneReviews();
            loadVersion();
        } catch (NullPointerException | CompressorException | IOException ex) {
            // NullPointerException is still caught as a safety net (for example when no HPO vocabulary was provided),
            // so that the constructor keeps logging such failures instead of propagating them to the caller
            this.logger.error("Failed to prepare the OMIM index: {}", ex.getMessage(), ex);
        }
    }

    /**
     * Return the parsed vocabulary data.
     *
     * @return the vocabulary data, may be an empty collection if parsing the source failed
     */
    public Collection<SolrInputDocument> getData() {
        return this.data.values();
    }

    /**
     * Fetches the compressed OMIM source and parses it into {@link #data}. The connection is released as soon as the
     * parsing is done, before any other source is contacted.
     *
     * @param sourceURL the location of the compressed OMIM source
     * @throws IOException if fetching or reading the source fails
     * @throws CompressorException if the compression format cannot be detected or read
     */
    private void parseOmim(String sourceURL) throws IOException, CompressorException {
        // The compression format auto-detection needs a stream supporting mark/reset, which isn't guaranteed for
        // streams obtained from an URL connection, thus the explicit buffering. Declaring the raw stream as a
        // separate resource ensures that it is closed even when creating the decompressing stream fails.
        try (InputStream raw = new BufferedInputStream(new URL(sourceURL).openConnection().getInputStream());
            BufferedReader in = new BufferedReader(new InputStreamReader(
                new CompressorStreamFactory().createCompressorInputStream(raw), ENCODING))) {
            transform(in);
        }
    }

    /**
     * Opens a reader for the content found at the specified location, decoded using {@link #ENCODING}.
     *
     * @param url the location to read from
     * @return a buffered reader, which must be closed by the caller
     * @throws IOException if the URL is not valid or the connection fails
     */
    private BufferedReader openReader(String url) throws IOException {
        return new BufferedReader(new InputStreamReader(new URL(url).openConnection().getInputStream(), ENCODING));
    }

    /**
     * Reads the OMIM source line by line, assembling the records delimited by {@link #RECORD_MARKER} and
     * {@link #END_MARKER} lines, and storing each one in {@link #data}.
     *
     * @param in the uncompressed OMIM source
     * @return the map of parsed documents, the same instance as {@link #data}
     * @throws IOException if reading from the source fails
     */
    private Map<String, SolrInputDocument> transform(BufferedReader in) throws IOException {
        String line;
        StringBuilder fieldValue = new StringBuilder();
        String fieldName = null;
        while ((line = in.readLine()) != null) {
            if (RECORD_MARKER.equalsIgnoreCase(line) || END_MARKER.equalsIgnoreCase(line)) {
                if (this.crtTerm != null) {
                    // Flush the last field of the record that just ended, then store the record
                    loadField(fieldName, fieldValue.toString().trim());
                    storeCrtTerm();
                } else {
                    this.crtTerm = new SolrInputDocument();
                }
                // Start the new record with a clean state, otherwise the last field of the previous record would be
                // loaded again, this time into the new record, when the first field marker is encountered
                fieldName = null;
                fieldValue.setLength(0);
            } else if (line.startsWith(FIELD_MARKER)) {
                // A new field starts, which means that the previous field, if any, is complete
                loadField(fieldName, fieldValue.toString().trim());
                fieldValue.setLength(0);
                fieldName = line.substring(FIELD_MARKER.length()).trim();
            } else {
                // Field values may span several lines, which are joined with a space
                fieldValue.append(line.trim()).append(' ');
            }
        }

        return this.data;
    }

    /**
     * Stores the current record in {@link #data}, indexed by its identifier, and starts a new empty record. Records
     * without an identifier are discarded, since they cannot be indexed.
     */
    private void storeCrtTerm() {
        Object id = this.crtTerm.getFieldValue(ID_FIELD);
        if (id != null) {
            this.data.put(String.valueOf(id), this.crtTerm);
        } else {
            this.logger.warn("Ignoring an OMIM record without a MIM number");
        }
        this.crtTerm = new SolrInputDocument();
    }

    /**
     * Stores the value of an OMIM field in the current record. Only the MIM number, title and text fields are used,
     * any other field is ignored.
     *
     * @param name the name of the OMIM field, as found after the field marker; ignored if blank
     * @param value the trimmed value of the field; ignored if blank
     */
    private void loadField(String name, String value) {
        // Field data found before the first record marker doesn't belong to any record
        if (this.crtTerm == null || StringUtils.isAnyBlank(name, value)) {
            return;
        }
        switch (name) {
        case FIELD_MIM_NUMBER:
            this.crtTerm.setField(ID_FIELD, value);
            break;
        case FIELD_TITLE:
            // Everything before the first separator is the title, the rest are synonyms
            String title = StringUtils.substringBefore(value, TITLE_SEPARATOR).trim();
            // NOTE(review): StringUtils.split treats the separator as a set of characters, so the synonyms are also
            // split on single semicolons, not only on the double semicolon; confirm that this is intended
            String[] synonyms = StringUtils.split(StringUtils.substringAfter(value, TITLE_SEPARATOR),
                    TITLE_SEPARATOR);
            this.crtTerm.setField(NAME_FIELD, title);
            for (String synonym : synonyms) {
                this.crtTerm.addField(SYNONYM_FIELD, synonym.trim());
            }
            break;
        case FIELD_TEXT:
            this.crtTerm.addField("def", value);
            break;
        default:
            break;
        }
    }

    /**
     * Loads the OMIM-Phenotype mapping. For each row marked as coming from OMIM, the annotated phenotype is added to
     * the matching term, and the phenotype together with its ancestors, as provided by the HPO vocabulary, is
     * gathered and added to the term once all the consecutive rows for that term have been read.
     *
     * @param positive {@code true} for loading the positive annotations, {@code false} for the negative ones
     */
    private void loadSymptoms(boolean positive) {
        String omimId = "";
        String previousOmimId = null;
        Set<String> ancestors = new HashSet<>();
        try (BufferedReader in = openReader(positive ? POSITIVE_ANNOTATIONS_URL : NEGATIVE_ANNOTATIONS_URL)) {
            for (CSVRecord row : CSVFormat.TDF.parse(in)) {
                if ("OMIM".equals(row.get(0))) {
                    omimId = row.get(1);
                    // When the row belongs to a different term than the previous row, flush the gathered ancestors
                    addAncestors(previousOmimId, omimId, ancestors, positive);
                    previousOmimId = omimId;
                    SolrInputDocument term = this.data.get(omimId);
                    if (term != null) {
                        term.addField(positive ? "actual_symptom" : "actual_not_symptom", row.get(4));
                    }
                    VocabularyTerm vterm = this.hpo.getTerm(row.get(4));
                    if (vterm != null) {
                        for (VocabularyTerm ancestor : vterm.getAncestorsAndSelf()) {
                            ancestors.add(ancestor.getId());
                        }
                    }
                }
            }
            // Flush the ancestors gathered for the last term
            addAncestors(omimId, null, ancestors, positive);
        } catch (IOException ex) {
            this.logger.error("Failed to load OMIM-HPO links: {}", ex.getMessage(), ex);
        }
    }

    /**
     * Adds the gathered ancestors to the term they were gathered for, if the annotations for that term are complete,
     * meaning that the next row belongs to another term. For negative annotations, the phenotypes already stored as
     * positive symptoms of the term are excluded. The set of ancestors is emptied afterwards, even when the term is
     * not known, so that nothing leaks into the next term.
     *
     * @param previousOmimId the identifier of the term for which the ancestors were gathered, may be {@code null}
     * @param newOmimId the identifier of the term found in the current row, {@code null} at the end of the file
     * @param ancestors the gathered ancestors; emptied by this method when they are flushed
     * @param positive {@code true} for positive annotations, {@code false} for negative ones
     */
    private void addAncestors(String previousOmimId, String newOmimId, Set<String> ancestors, boolean positive) {
        if (previousOmimId == null || previousOmimId.equals(newOmimId)) {
            return;
        }
        final String symptomField = "symptom";
        SolrInputDocument term = this.data.get(previousOmimId);
        // The annotations may reference identifiers which aren't present in the parsed OMIM source
        if (term != null) {
            if (positive) {
                term.addField(symptomField, new HashSet<String>(ancestors));
            } else {
                // A term without positive annotations doesn't have any value for the symptoms field
                Collection<Object> positiveSymptoms = term.getFieldValues(symptomField);
                if (positiveSymptoms != null) {
                    ancestors.removeAll(positiveSymptoms);
                }
                term.addField("not_symptom", new HashSet<String>(ancestors));
            }
        }
        ancestors.clear();
    }

    /**
     * Loads the OMIM-Gene mapping, adding the approved gene symbol and the Ensembl gene identifier, when available,
     * to the matching terms. Only the rows whose type mentions a gene are used.
     */
    private void loadGenes() {
        final String missing = "-";
        try (BufferedReader in = openReader(GENE_ANNOTATIONS_URL)) {
            for (CSVRecord row : CSVFormat.TDF.withHeader().parse(in)) {
                if (!row.get("Type").contains("gene")) {
                    continue;
                }
                // NOTE(review): the term is looked up using the value of the third column (index 2); confirm that
                // this is the column holding the MIM number in the gene mapping file
                SolrInputDocument term = this.data.get(row.get(2));
                if (term != null) {
                    String gs = row.get("Approved Gene Symbol");
                    if (!missing.equals(gs)) {
                        term.addField(GENE_FIELD, gs);
                    }
                    String eid = row.get("Ensembl Gene ID");
                    if (!missing.equals(eid)) {
                        term.addField(GENE_FIELD, eid);
                    }
                }
            }
        } catch (IOException ex) {
            this.logger.error("Failed to load OMIM-Gene links: {}", ex.getMessage(), ex);
        }
    }

    /**
     * Loads the OMIM-GeneReviews mapping, setting on each matching term a link built from the value of the first
     * column; the term is looked up using the value of the third column.
     */
    private void loadGeneReviews() {
        try (BufferedReader in = openReader(GENEREVIEWS_MAPPING_URL)) {
            for (CSVRecord row : CSVFormat.TDF.withHeader().parse(in)) {
                SolrInputDocument term = this.data.get(row.get(2));
                if (term != null) {
                    term.setField("gene_reviews_link", "https://www.ncbi.nlm.nih.gov/books/" + row.get(0));
                }
            }
        } catch (IOException ex) {
            this.logger.error("Failed to load OMIM-GeneReviews links: {}", ex.getMessage(), ex);
        }
    }

    /**
     * Adds a special document holding the version of the data, which is the current time, in UTC, formatted as an
     * ISO date and time.
     */
    private void loadVersion() {
        SolrInputDocument metaTerm = new SolrInputDocument();
        metaTerm.addField(ID_FIELD, "HEADER_INFO");
        metaTerm.addField("version", ISODateTimeFormat.dateTime().withZoneUTC().print(new DateTime()));
        this.data.put("VERSION", metaTerm);
    }
}