act.installer.pubchem.PubchemMeshSynonyms.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.pubchem.PubchemMeshSynonyms.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.pubchem;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.client.utils.URIBuilder;
import org.apache.jena.arq.querybuilder.SelectBuilder;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.query.Query;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.apache.jena.sparql.core.Var;
import org.apache.jena.vocabulary.RDF;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The PubchemMeshSynonyms class provides an API to get Pubchem synonyms and MeSH terms given an InChI string.
 * It assumes that a Virtuoso SPARQL endpoint is reachable over HTTP at {@code http://<host>:<port>/sparql} and that
 * the necessary data has been loaded in. When no host/port is given, the defaults declared in this class
 * ({@code localhost}, port {@code 8890}) are used.
 * TODO(thomas): create a Wiki page describing the Virtuoso setup and how to load the data
 * The HTML server can be accessed at http://<LOCAL IP OR Alias>:8890/sparql, with a UI to run SPARQL queries. Try it!
 *
 * <p>Typical usage:
 * <pre>
 *   PubchemMeshSynonyms fetcher = new PubchemMeshSynonyms(host, port);
 *   String cid = fetcher.fetchCIDFromInchi(inchi);   // null when no compound matches
 *   if (cid != null) {
 *     fetcher.fetchPubchemSynonymsFromCID(cid);
 *     fetcher.fetchMeshTermsFromCID(cid);
 *   }
 * </pre>
 */
public class PubchemMeshSynonyms {

    private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemMeshSynonyms.class);

    public static final String OPTION_SERVICE_HOST = "s";
    public static final String OPTION_SERVICE_PORT = "p";
    public static final String OPTION_QUERY_INCHI = "i";
    // Short/long names of the help flag; main() looks the flag up by its long name.
    private static final String OPTION_HELP = "h";
    private static final String OPTION_HELP_LONG = "help";

    public static final String HELP_MESSAGE = "This class provides an API to get Pubchem synonyms and MeSH terms given an InChI string.";

    // Default location of the Virtuoso SPARQL endpoint
    private static final String DEFAULT_SERVICE_HOST = "localhost"; // chimay
    private static final String DEFAULT_SERVICE_PORT = "8890";

    /** Builders for every command line option understood by {@link #main(String[])}. */
    public static final List<Option.Builder> OPTION_BUILDERS = createOptionBuilders();

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    // Compiled once: matches the "CID<digits>" part of a Pubchem resource local name.
    private static final Pattern CID_PATTERN = Pattern.compile("CID\\d+");
    private static final String ENGLISH_LANG_TAG = "en";

    // Full URL of the SPARQL service, e.g. http://localhost:8890/sparql
    private final String sparqlService;

    /*
    The CID_QUERY_TMPL SelectBuilder constructs SPARQL queries like the below one:
    #########
    PREFIX  sio:  <http://semanticscience.org/resource/>
    SELECT DISTINCT  ?inchi_iri
    FROM <http://rdf.ncbi.nlm.nih.gov/pubchem/descriptor/compound>
    WHERE
    { ?inchi_iri  sio:has-value  "InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)"@en }
    #########
    */
    private static final SelectBuilder CID_QUERY_TMPL = new SelectBuilder()
            // PREFIX (shorthands for IRI namespaces which can be looked up on http://prefix.cc)
            .addPrefix("sio", "http://semanticscience.org/resource/")
            // SELECT
            .setDistinct(true).addVar("inchi_iri")
            // FROM
            .from("http://rdf.ncbi.nlm.nih.gov/pubchem/descriptor/compound")
            // WHERE
            .addWhere("?inchi_iri", "sio:has-value", "?inchi_string");

    /*
    The PUBCHEM_SYNO_QUERY_TMPL SelectBuilder constructs SPARQL queries like the below one:
    #########
    PREFIX  sio:  <http://semanticscience.org/resource/>
    PREFIX  rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX  compound: <http://rdf.ncbi.nlm.nih.gov/pubchem/compound/>

    SELECT DISTINCT  ?value ?type
    FROM <http://rdf.ncbi.nlm.nih.gov/pubchem/synonym>
    WHERE
      { ?syno  sio:is-attribute-of  compound:CID1983 ;
         rdf:type             ?type ;
         sio:has-value        ?value
      }
    #########
    */
    private static final SelectBuilder PUBCHEM_SYNO_QUERY_TMPL = new SelectBuilder()
            // PREFIX (shorthands for IRI namespaces which can be looked up on http://prefix.cc)
            .addPrefix("sio", "http://semanticscience.org/resource/")
            .addPrefix("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
            .addPrefix("compound", "http://rdf.ncbi.nlm.nih.gov/pubchem/compound/")
            // SELECT
            .setDistinct(true).addVar("value").addVar("type")
            // FROM
            .from("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym")
            // WHERE
            .addWhere("?syno", "sio:is-attribute-of", "?compound").addWhere("?syno", RDF.type, "?type")
            .addWhere("?syno", "sio:has-value", "?value");

    /*
    The MESH_TERMS_QUERY_TMPL SelectBuilder constructs SPARQL queries like the below one:
    #########
    PREFIX  sio:  <http://semanticscience.org/resource/>
    PREFIX  dcterms: <http://purl.org/dc/terms/>
    PREFIX  rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX  compound: <http://rdf.ncbi.nlm.nih.gov/pubchem/compound/>
    PREFIX  meshv: <http://id.nlm.nih.gov/mesh/vocab#>

    SELECT DISTINCT  ?concept_label ?lexical_tag
    FROM <http://rdf.ncbi.nlm.nih.gov/pubchem/synonym>
    FROM <http://id.nlm.nih.gov/mesh/>
    WHERE
      { ?syno     sio:is-attribute-of  compound:CID1983 ;
            dcterms:subject      ?mesh_concept .
        ?mesh_concept
            rdfs:label           ?concept_label ;
            meshv:preferredTerm  ?mesh_term .
        ?mesh_term  meshv:lexicalTag   ?lexical_tag
      }
    #########
    */
    private static final SelectBuilder MESH_TERMS_QUERY_TMPL = new SelectBuilder()
            // PREFIX (shorthands for IRI namespaces which can be looked up on http://prefix.cc)
            .addPrefix("sio", "http://semanticscience.org/resource/")
            .addPrefix("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
            .addPrefix("compound", "http://rdf.ncbi.nlm.nih.gov/pubchem/compound/")
            .addPrefix("dcterms", "http://purl.org/dc/terms/")
            .addPrefix("meshv", "http://id.nlm.nih.gov/mesh/vocab#")
            // SELECT
            .setDistinct(true).addVar("concept_label").addVar("lexical_tag")
            // FROM
            .from("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym").from("http://id.nlm.nih.gov/mesh/")
            // WHERE
            .addWhere("?syno", "sio:is-attribute-of", "?compound")
            .addWhere("?syno", "dcterms:subject", "?mesh_concept")
            .addWhere("?mesh_concept", "rdfs:label", "?concept_label")
            .addWhere("?mesh_concept", "meshv:preferredTerm", "?mesh_term")
            .addWhere("?mesh_term", "meshv:lexicalTag", "?lexical_tag");

    /**
     * Builds the list of command line option builders.
     * A plain ArrayList is returned (rather than a double-brace initialized anonymous subclass) so that the
     * list stays mutable for existing callers without creating an extra class.
     *
     * @return the option builders for the host, port, InChI and help options
     */
    private static List<Option.Builder> createOptionBuilders() {
        List<Option.Builder> builders = new ArrayList<>();
        builders.add(Option.builder(OPTION_SERVICE_HOST).argName("SERVICE_HOST")
                .desc("The SPARQL server host. Default is " + DEFAULT_SERVICE_HOST).hasArg()
                .longOpt("service-host").type(String.class));
        builders.add(Option.builder(OPTION_SERVICE_PORT).argName("SERVICE_PORT")
                .desc("The SPARQL server host's port. Default is " + DEFAULT_SERVICE_PORT).hasArg()
                .longOpt("service-port").type(Integer.class));
        builders.add(Option.builder(OPTION_QUERY_INCHI).argName("QUERY_INCHI")
                .desc("The InChI string to fetch synonyms for. "
                        + "For example, InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10) representing APAP")
                .hasArg().required().longOpt("query-inchi").type(String.class));
        // main() checks for the "help" option, so it has to be registered for that check to ever succeed.
        builders.add(Option.builder(OPTION_HELP).argName("help")
                .desc("Prints this help message").longOpt(OPTION_HELP_LONG));
        return builders;
    }

    /**
     * Command line entry point: looks up the Pubchem compound id of the InChI passed with the query-inchi option,
     * then logs its Pubchem synonyms and MeSH terms.
     * Exits with status 1 when the arguments cannot be parsed or the port is not an integer.
     *
     * @param args the command line arguments, see {@link #OPTION_BUILDERS}
     */
    public static void main(final String[] args) {

        // Parse the command line options
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(PubchemMeshSynonyms.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
            return; // not reached; makes it explicit that cl is always assigned below
        }

        if (cl.hasOption(OPTION_HELP_LONG)) {
            HELP_FORMATTER.printHelp(PubchemMeshSynonyms.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        String serviceHostIp = cl.getOptionValue(OPTION_SERVICE_HOST, DEFAULT_SERVICE_HOST);
        String servicePortValue = cl.getOptionValue(OPTION_SERVICE_PORT, DEFAULT_SERVICE_PORT);
        int servicePort;
        try {
            servicePort = Integer.parseInt(servicePortValue);
        } catch (NumberFormatException e) {
            // Report a bad port like any other argument error instead of dying with a stack trace
            System.err.format("Argument parsing failed: service port '%s' is not an integer\n", servicePortValue);
            HELP_FORMATTER.printHelp(PubchemMeshSynonyms.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
            return;
        }
        String queryInchi = cl.getOptionValue(OPTION_QUERY_INCHI);

        PubchemMeshSynonyms pubchemMeshSynonyms = new PubchemMeshSynonyms(serviceHostIp, servicePort);
        String cid = pubchemMeshSynonyms.fetchCIDFromInchi(queryInchi);
        if (cid != null) {
            Map<PubchemSynonymType, Set<String>> pubchemSynonyms = pubchemMeshSynonyms
                    .fetchPubchemSynonymsFromCID(cid);
            LOGGER.info("Resulting Pubchem synonyms for %s are: %s", queryInchi, pubchemSynonyms);
            Map<MeshTermType, Set<String>> meshTerms = pubchemMeshSynonyms.fetchMeshTermsFromCID(cid);
            LOGGER.info("Resulting MeSH term s for %s are: %s", queryInchi, meshTerms);
        } else {
            LOGGER.info("No PubChem compound ID was found for the input InChI.");
        }
    }

    /**
     * Creates an instance querying the SPARQL service at the default host and port.
     */
    public PubchemMeshSynonyms() {
        this(DEFAULT_SERVICE_HOST, Integer.parseInt(DEFAULT_SERVICE_PORT));
    }

    /**
     * Creates an instance querying the SPARQL service at the given host and port.
     *
     * @param hostIp host name or IP of the SPARQL server
     * @param port port of the SPARQL server
     * @throws RuntimeException if no valid service URI can be built from the parameters
     */
    public PubchemMeshSynonyms(String hostIp, Integer port) {
        sparqlService = getServiceFromHostParams(hostIp, port);
    }

    /**
     * Builds the URL of the SPARQL service: {@code http://<hostIp>:<port>/sparql}.
     *
     * @param hostIp host name or IP of the SPARQL server
     * @param port port of the SPARQL server
     * @return the service URL as a string
     * @throws RuntimeException wrapping the URISyntaxException when the URI cannot be built
     */
    private static String getServiceFromHostParams(String hostIp, Integer port) {
        URI uri;
        try {
            uri = new URIBuilder().setScheme("http").setHost(hostIp).setPort(port).setPath("/sparql").build();
        } catch (URISyntaxException e) {
            String msg = String.format("An error occurred when trying to build the SPARQL service URI: %s",
                    e.getMessage());
            LOGGER.error(msg);
            // Keep the original exception as the cause so the stack trace is not lost
            throw new RuntimeException(msg, e);
        }
        String service = uri.toString();
        LOGGER.debug("Constructed the following URL for SPARQL service: %s", service);
        return service;
    }

    /**
     * Looks up the Pubchem compound id (e.g. "CID1983") associated with an InChI.
     *
     * @param inchi the InChI string to look up
     * @return the compound id, or null when the query returns no result or the result does not contain a
     *         compound id
     */
    public String fetchCIDFromInchi(String inchi) {
        // The clone method has its own implementation in the SelectBuilder. Thus safe to use!
        SelectBuilder sb = CID_QUERY_TMPL.clone();
        // The inchi literal needs to be created with a language tag, otherwise it will not match anything
        // See "Matching Literals with Language Tags" (https://www.w3.org/TR/rdf-sparql-query/#matchLangTags)
        // for more information
        sb.setVar(Var.alloc("inchi_string"), NodeFactory.createLiteral(inchi, ENGLISH_LANG_TAG));
        Query query = sb.build();

        String resourceName;
        LOGGER.debug("Executing SPARQL query: %s", query.toString());
        try (QueryExecution qexec = QueryExecutionFactory.sparqlService(sparqlService, query)) {
            ResultSet results = qexec.execSelect();
            // TODO: we assume here that there is at most one CID per InChI and return the first CID
            // Improve that behavior so we can stitch together many CID's synonyms.
            if (!results.hasNext()) {
                LOGGER.info("Could not find Pubchem Compound Id for input InChI %s", inchi);
                return null;
            }
            resourceName = results.nextSolution().getResource("inchi_iri").getLocalName();
        }

        String cid = extractCIDFromResourceName(resourceName);
        if (cid == null) {
            // Do not claim success when the resource name carries no "CID<digits>" token
            LOGGER.warn("Could not extract a Pubchem Compound Id from resource name %s for input InChI %s",
                    resourceName, inchi);
            return null;
        }
        LOGGER.info("Found Pubchem Compound Id %s for input InChI %s", cid, inchi);
        return cid;
    }

    /**
     * Extracts the first "CID<digits>" token from a resource name.
     *
     * @param resourceName the resource local name, may be null
     * @return the matched token, or null when the name is null or contains no such token
     */
    private String extractCIDFromResourceName(String resourceName) {
        if (resourceName == null) {
            return null;
        }
        Matcher m = CID_PATTERN.matcher(resourceName);
        return m.find() ? m.group(0) : null;
    }

    /**
     * Instantiates one of the compound-based query templates for a given compound id.
     *
     * @param template a query template with a free {@code ?compound} variable
     * @param cid the compound id to bind, e.g. "CID1983"
     * @return the query with {@code ?compound} set to {@code compound:<cid>}
     */
    private static Query buildCompoundQuery(SelectBuilder template, String cid) {
        // The clone method has its own implementation in the SelectBuilder. Thus safe to use!
        SelectBuilder sb = template.clone();
        sb.setVar(Var.alloc("compound"), String.format("compound:%s", cid));
        return sb.build();
    }

    /**
     * Fetches the Pubchem synonyms of a compound, grouped by synonym type.
     *
     * @param cid the Pubchem compound id, e.g. "CID1983"
     * @return a map from synonym type to the set of synonyms of that type; empty when the query returns no row
     */
    public Map<PubchemSynonymType, Set<String>> fetchPubchemSynonymsFromCID(String cid) {
        Query query = buildCompoundQuery(PUBCHEM_SYNO_QUERY_TMPL, cid);
        LOGGER.debug("Executing SPARQL query: %s", query.toString());
        Map<PubchemSynonymType, Set<String>> map = new HashMap<>();

        try (QueryExecution queryExecution = QueryExecutionFactory.sparqlService(sparqlService, query)) {
            ResultSet results = queryExecution.execSelect();
            while (results.hasNext()) {
                QuerySolution solution = results.nextSolution();
                String cheminfId = solution.getResource("type").getLocalName();
                String synonym = solution.getLiteral("value").getString();
                LOGGER.debug("Found synonym %s with type %s", synonym, cheminfId);
                PubchemSynonymType synonymType = PubchemSynonymType.getByCheminfId(cheminfId);
                Set<String> synonyms = map.get(synonymType);
                if (synonyms == null) {
                    synonyms = new HashSet<>();
                    map.put(synonymType, synonyms);
                }
                synonyms.add(synonym);
            }
        }
        return map;
    }

    /**
     * Fetches the MeSH concept labels attached to a compound, grouped by the lexical tag of their preferred term.
     *
     * @param cid the Pubchem compound id, e.g. "CID1983"
     * @return a map from MeSH term type to the set of concept labels of that type; empty when the query returns
     *         no row
     */
    public Map<MeshTermType, Set<String>> fetchMeshTermsFromCID(String cid) {
        Query query = buildCompoundQuery(MESH_TERMS_QUERY_TMPL, cid);
        LOGGER.debug("Executing SPARQL query: %s", query.toString());
        Map<MeshTermType, Set<String>> map = new HashMap<>();

        try (QueryExecution queryExecution = QueryExecutionFactory.sparqlService(sparqlService, query)) {
            ResultSet results = queryExecution.execSelect();
            while (results.hasNext()) {
                QuerySolution solution = results.nextSolution();
                String conceptLabel = solution.getLiteral("concept_label").getString();
                String lexicalTag = solution.getLiteral("lexical_tag").getString();
                LOGGER.debug("Found term %s with tag %s", conceptLabel, lexicalTag);
                MeshTermType meshTermsType = MeshTermType.getByLexicalTag(lexicalTag);
                Set<String> terms = map.get(meshTermsType);
                if (terms == null) {
                    terms = new HashSet<>();
                    map.put(meshTermsType, terms);
                }
                terms.add(conceptLabel);
            }
        }
        return map;
    }
}