uk.ac.ebi.metabolomes.webservices.EUtilsWebServiceConnection.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ebi.metabolomes.webservices.EUtilsWebServiceConnection.java

Source

/*
 * Copyright (C) 2013 Pablo Moreno <pablacious at users.sf.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package uk.ac.ebi.metabolomes.webservices;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.sun.jersey.api.client.Client;
import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.WebResource;
import com.sun.jersey.core.util.MultivaluedMapImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import uk.ac.ebi.mdk.domain.annotation.crossreference.CrossReference;
import uk.ac.ebi.mdk.domain.identifier.PubChemCompoundIdentifier;
import uk.ac.ebi.mdk.domain.identifier.PubChemSubstanceIdentifier;
import uk.ac.ebi.metabolomes.webservices.eutils.*;

import javax.ws.rs.core.MultivaluedMap;
import javax.xml.stream.XMLStreamException;
import javax.xml.ws.WebServiceException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @name    EUtilsWebServiceConnection
 * @date    2011.08.04
 * @version $Rev$ : Last Changed $Date$
 * @author  pmoreno
 * @author  $Author$ (this version)
 * @brief   Client for the E-Utils web service from NCBI. Relies on the Jersey Client.
 * WARNING: consult the NCBI E-utilities usage policies before sending large or frequent requests.
 *
 */
public class EUtilsWebServiceConnection extends NCBIJerseyBasedWebService {

    /** Class-wide log4j logger. */
    private static final Logger LOGGER = Logger.getLogger(EUtilsWebServiceConnection.class);
    /** Jersey client from which a resource is built for every E-utilities endpoint; set once by the constructor. */
    private final Client client;
    /** Resource bound to the E-utilities base URL; set once by the constructor and not read elsewhere in this class. */
    private final WebResource webResource;

    /**
     * Guards against oversized submissions: any collection holding more identifiers than
     * {@code MAX_RECORDS_PER_QUERY} is rejected before a request is built.
     *
     * @param dbFromIds identifiers about to be submitted
     * @throws javax.xml.ws.WebServiceException if the collection is larger than {@code MAX_RECORDS_PER_QUERY}
     */
    private void checkNumberOfSubmittedEntries(Collection<String> dbFromIds) throws WebServiceException {
        final int submitted = dbFromIds.size();
        if (submitted <= MAX_RECORDS_PER_QUERY) {
            return;
        }
        throw new WebServiceException("More than 5000 entries submitted, this is not permitted.... submitted "
                + submitted + " entries.");
    }

    // Compiled once for the whole class rather than once per connection object: Pattern instances are
    // immutable and safe to share. The field names are unchanged because parseEpostOutput refers to them.
    /** Captures the WebEnv token from one line of EPost XML output. */
    private static final Pattern webEnvPattern = Pattern.compile("<WebEnv>(\\S+)<\\/WebEnv>");
    /** Captures the QueryKey value from one line of EPost XML output. */
    private static final Pattern queryKeyPattern = Pattern.compile("<QueryKey>(\\S+)<\\/QueryKey>");

    /**
     * Scans an EPost response line by line and extracts the WebEnv and QueryKey values.
     * <p>
     * Both patterns are tried on every line, so the values are found whether the service prints them on
     * separate lines or on a single line. If an element occurs more than once, the last occurrence wins.
     * The stream is always closed before this method returns.
     *
     * @param entityInputStream the body of the EPost response; closed by this method
     * @return a result whose WebEnv and/or QueryKey are {@code null} when the matching element was not found
     * @throws IOException if the stream cannot be read or closed
     */
    private EPostResult parseEpostOutput(InputStream entityInputStream) throws IOException {
        EPostResult res = new EPostResult();
        // Decode with an explicit charset so the result does not depend on the platform default.
        // NOTE(review): UTF-8 is assumed to be the encoding of the E-utilities XML output - confirm.
        BufferedReader reader = new BufferedReader(new InputStreamReader(entityInputStream, "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher webEnvMatcher = webEnvPattern.matcher(line);
                if (webEnvMatcher.find()) {
                    res.setWebEnv(webEnvMatcher.group(1));
                }
                // Deliberately no 'continue' above: a QueryKey on the same line must not be skipped.
                Matcher queryKeyMatcher = queryKeyPattern.matcher(line);
                if (queryKeyMatcher.find()) {
                    res.setQueryKey(queryKeyMatcher.group(1));
                }
            }
        } finally {
            // Closing the reader also closes the underlying response stream.
            reader.close();
        }
        return res;
    }

    /**
     * Fetches the ESummary records of the given PubChem BioAssay identifiers.
     *
     * @param aids BioAssay identifiers to look up
     * @return the parsed summaries, or an empty list when the response XML cannot be parsed (the failure
     *         is logged)
     */
    public List<PubChemBioAssayESummaryResult> getPubChemBioassaysSummaries(List<String> aids) {
        final InputStream summaryXml = submitESummaryQuery(EntrezDB.pcassay, aids);
        final PubChemBioAssayESummaryXMLParser xmlParser = new PubChemBioAssayESummaryXMLParser();
        try {
            return xmlParser.parseESummaryResult(summaryXml);
        } catch (XMLStreamException parseFailure) {
            LOGGER.error("Could not parse response for PubChem BioAssays Summaries", parseFailure);
            return new ArrayList<PubChemBioAssayESummaryResult>();
        }
    }

    /**
     * Entrez databases addressable through this connection. The constant names are sent verbatim as
     * request parameter values (via {@code toString()}), hence the lower-case spelling.
     */
    public enum EntrezDB {
        pccompound,
        pcsubstance,
        mesh,
        pubmed,
        pcassay
    }

    /**
     * Root URL of the NCBI E-utilities endpoints; the individual {@code *.fcgi} names are appended to it.
     * HTTPS is used because NCBI requires E-utilities requests to be made over HTTPS.
     */
    private final String baseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /**
     * Creates a connection backed by a fresh Jersey client and binds a resource to the E-utilities base URL.
     */
    public EUtilsWebServiceConnection() {
        this.client = Client.create();
        this.webResource = this.client.resource(this.baseURL);
    }

    /**
     * Queries the NCBI EUtils web service through elink.fcgi for all the associations existing in a
     * database (dbTo) for the identifiers provided for an initial database (dbFrom). Every identifier is
     * sent as its own {@code id} parameter. Not more than {@code MAX_RECORDS_PER_QUERY} ids may be
     * submitted at once.
     *
     * @param dbFromIds    the identifiers to search for
     * @param dbFrom       the Entrez database those identifiers belong to
     * @param dbTo         the Entrez database where hits are wanted
     * @param addTerm      name of one extra request parameter (the callers in this class pass
     *                     {@code "linkname"}); the extra parameter is only sent when both this and
     *                     {@code addTermValue} are non-null
     * @param addTermValue value of the extra request parameter
     * @return multimap with the one-to-many associations dbFrom identifiers (keys) to dbTo identifiers
     *         (values); empty when the response XML could not be parsed (the failure is logged)
     * @throws javax.xml.ws.WebServiceException if more than the allowed number of entries were submitted
     * @throws RuntimeException if the service answers with a status other than 200
     */
    public Multimap<String, String> getDBToIDsFromDBFromIDs(List<String> dbFromIds, EntrezDB dbFrom, EntrezDB dbTo,
            String addTerm, String addTermValue) throws WebServiceException {
        checkNumberOfSubmittedEntries(dbFromIds);
        WebResource webRes = client.resource(baseURL + "elink.fcgi");
        MultivaluedMap<String, String> queryParams = new MultivaluedMapImpl();
        queryParams.add("dbfrom", dbFrom.toString());
        queryParams.add("db", dbTo.toString());
        if (addTerm != null && addTermValue != null) {
            queryParams.add(addTerm, addTermValue);
        }
        for (String id : dbFromIds) {
            queryParams.add("id", id);
        }

        ClientResponse resp = submitPost(webRes, queryParams);

        if (resp.getStatus() != 200) {
            throw new RuntimeException("Failed : HTTP error code : " + resp.getStatus());
        }

        // The mapping is one-to-many; HashMultimap silently drops repeated key/value pairs.
        Multimap<String, String> res = HashMultimap.create();
        InputStream entity = resp.getEntityInputStream();
        try {
            ELinkXMLResponseParser elinkXMLResponseParser = new ELinkXMLResponseParser();
            res.putAll(elinkXMLResponseParser.parseLinkSetBlock(entity));
        } catch (XMLStreamException ex) {
            LOGGER.warn("Could not parse output XML adequately...", ex);
        } finally {
            // Release the response stream once parsing is over, whether it succeeded or not.
            if (entity != null) {
                try {
                    entity.close();
                } catch (IOException closeFailure) {
                    LOGGER.warn("Could not close the ELink response stream", closeFailure);
                }
            }
        }

        return res;
    }

    /**
     * Queries the NCBI EUtils web service through esearch.fcgi for the identifiers that match a query
     * term in the given database. Results are fetched page by page ({@code MAX_RECORDS_PER_QUERY} per
     * request) for as long as the parsed result reports that results are missing.
     *
     * @param db   the Entrez database to search
     * @param term the query term
     * @return the identifiers collected from all pages; if a response cannot be parsed the failure is
     *         logged and the identifiers collected up to that point are returned
     * @throws RuntimeException if the service answers with a status other than 200
     */
    public Set<String> getDBIDsFromTermQuery(EntrezDB db, String term) throws WebServiceException {
        Set<String> ids = new HashSet<String>();
        ESearchXMLResponseParser parser = new ESearchXMLResponseParser();
        try {
            ESearchResult res = fetchESearchPage(parser, db, term, 0);
            ids.addAll(res.getIds());
            while (res.resultsMissing()) {
                res = fetchESearchPage(parser, db, term, res.getNextStart());
                ids.addAll(res.getIds());
            }
        } catch (XMLStreamException e) {
            LOGGER.error("Could not parse ESearch response for db " + db + " and term " + term, e);
        }

        return ids;
    }

    /** Requests and parses one page of ESearch results, closing the response stream afterwards. */
    private ESearchResult fetchESearchPage(ESearchXMLResponseParser parser, EntrezDB db, String term,
            Integer retStart) throws XMLStreamException {
        InputStream entity = getESearchClientResponse(db, term, MAX_RECORDS_PER_QUERY, retStart)
                .getEntityInputStream();
        try {
            return parser.parseESearchResult(entity);
        } finally {
            if (entity != null) {
                try {
                    entity.close();
                } catch (IOException closeFailure) {
                    LOGGER.warn("Could not close the ESearch response stream", closeFailure);
                }
            }
        }
    }

    /**
     * Posts one esearch.fcgi request and hands back the raw response.
     *
     * @param db       the Entrez database to search
     * @param term     the query term
     * @param retMax   value sent as the {@code retMax} parameter
     * @param retStart value sent as the {@code retStart} parameter
     * @return the response, guaranteed to carry status 200
     * @throws RuntimeException if the service answers with any other status
     */
    private ClientResponse getESearchClientResponse(EntrezDB db, String term, Integer retMax, Integer retStart) {
        final MultivaluedMap searchParams = new MultivaluedMapImpl();
        searchParams.add("db", db.toString());
        searchParams.add("term", term);
        searchParams.add("retMax", String.valueOf(retMax));
        searchParams.add("retStart", String.valueOf(retStart));

        final ClientResponse response = submitPost(client.resource(baseURL + "esearch.fcgi"), searchParams);
        final int status = response.getStatus();
        if (status == 200) {
            return response;
        }
        throw new RuntimeException("Failed : HTTP error code : " + status);
    }

    /**
     * Identifier-object variant of {@code getPubChemCompoundFromPubChemSubstance}: the accession of every
     * identifier is collected, in order, and the string-based lookup is delegated to.
     *
     * @param pubChemSubstanceIdentifiers PubChem substance identifiers to resolve
     * @return whatever the string-based lookup returns for the collected accessions
     */
    public Multimap<String, String> getPubChemCompoundFromPubChemSubstanceIdents(
            List<PubChemSubstanceIdentifier> pubChemSubstanceIdentifiers) throws WebServiceException {
        final List<String> accessions = new ArrayList<String>(pubChemSubstanceIdentifiers.size());
        final Iterator<PubChemSubstanceIdentifier> cursor = pubChemSubstanceIdentifiers.iterator();
        while (cursor.hasNext()) {
            accessions.add(cursor.next().getAccession());
        }
        return getPubChemCompoundFromPubChemSubstance(accessions);
    }

    /**
     * Identifier-object variant of {@code getPubChemSubstanceFromPubChemCompound}: the accession of every
     * identifier is collected, in order, and the string-based lookup is delegated to.
     *
     * @param pubChemCompoundIdentifiers PubChem compound identifiers to resolve
     * @return whatever the string-based lookup returns for the collected accessions
     */
    public Multimap<String, String> getPubChemSubstanceFromPubChemCompoundIdents(
            List<PubChemCompoundIdentifier> pubChemCompoundIdentifiers) throws WebServiceException {
        final List<String> accessions = new ArrayList<String>(pubChemCompoundIdentifiers.size());
        final Iterator<PubChemCompoundIdentifier> cursor = pubChemCompoundIdentifiers.iterator();
        while (cursor.hasNext()) {
            accessions.add(cursor.next().getAccession());
        }
        return getPubChemSubstanceFromPubChemCompound(accessions);
    }

    /**
     * Identifier-object variant of {@code getNamesForPubChemCompounds}: the accession of every identifier
     * is collected, in iteration order, and the string-based lookup is delegated to.
     *
     * @param pubchemCompoundIds PubChem compound identifiers whose names are wanted
     * @return whatever the string-based lookup returns for the collected accessions
     */
    public PubChemNamesResult getNamesForPubChemCompoundIdentifiers(
            Collection<PubChemCompoundIdentifier> pubchemCompoundIds) {
        final Collection<String> accessions = new ArrayList<String>(pubchemCompoundIds.size());
        final Iterator<PubChemCompoundIdentifier> cursor = pubchemCompoundIds.iterator();
        while (cursor.hasNext()) {
            accessions.add(cursor.next().getAccession());
        }
        return getNamesForPubChemCompounds(accessions);
    }

    /**
     * Runs a term search against the {@code pcassay} database.
     *
     * @param search the query term
     * @return the identifiers produced by the term query on {@code pcassay}
     */
    public Set<String> getPubChemBioAssaysForTermSearch(String search) {
        final Set<String> assayIds = getDBIDsFromTermQuery(EntrezDB.pcassay, search);
        return assayIds;
    }

    /**
     * Links PubChem compound ids to PubChem substance ids, restricting the link query with
     * {@code linkname=pccompound_pcsubstance_same}.
     *
     * @param pubchemCompoundIds compound ids to start from
     * @return multimap from each compound id to the linked substance ids
     */
    public Multimap<String, String> getPubChemSubstanceFromPubChemCompound(List<String> pubchemCompoundIds)
            throws WebServiceException {
        final String linkName = "pccompound_pcsubstance_same";
        return getDBToIDsFromDBFromIDs(pubchemCompoundIds, EntrezDB.pccompound, EntrezDB.pcsubstance,
                "linkname", linkName);
    }

    /**
     * Links PubChem substance ids to PubChem compound ids, restricting the link query with
     * {@code linkname=pcsubstance_pccompound_same}.
     *
     * @param pubchemSubstanceIds substance ids to start from
     * @return multimap from each substance id to the linked compound ids
     */
    public Multimap<String, String> getPubChemCompoundFromPubChemSubstance(List<String> pubchemSubstanceIds)
            throws WebServiceException {
        final String linkName = "pcsubstance_pccompound_same";
        return getDBToIDsFromDBFromIDs(pubchemSubstanceIds, EntrezDB.pcsubstance, EntrezDB.pccompound,
                "linkname", linkName);
    }

    /**
     * Links PubChem compound ids to other PubChem compound ids, restricting the link query with
     * {@code linkname=pccompound_pccompound_parent}.
     *
     * @param pubchemCompoundIds compound ids to start from
     * @return multimap from each compound id to the linked compound ids
     */
    public Multimap<String, String> getPubChemCompoundParentFromPubChemCompound(List<String> pubchemCompoundIds)
            throws WebServiceException {
        final String linkName = "pccompound_pccompound_parent";
        return getDBToIDsFromDBFromIDs(pubchemCompoundIds, EntrezDB.pccompound, EntrezDB.pccompound,
                "linkname", linkName);
    }

    /**
     * Identifier-object variant of {@code getPubChemCompoundParentFromPubChemCompound}: the accession of
     * every identifier is collected, in order, and the string-based lookup is delegated to.
     *
     * @param pubChemCompoundIdentifiers PubChem compound identifiers to resolve
     * @return whatever the string-based lookup returns for the collected accessions
     */
    public Multimap<String, String> getPubChemCompoundParentFromPubChemCompoundIdents(
            List<PubChemCompoundIdentifier> pubChemCompoundIdentifiers) throws WebServiceException {
        final List<String> accessions = new ArrayList<String>(pubChemCompoundIdentifiers.size());
        final Iterator<PubChemCompoundIdentifier> cursor = pubChemCompoundIdentifiers.iterator();
        while (cursor.hasNext()) {
            accessions.add(cursor.next().getAccession());
        }
        return getPubChemCompoundParentFromPubChemCompound(accessions);
    }

    /**
     * Retrieves the cross references of a set of PubChem substance ids in two steps: the ids are first
     * uploaded with epost.fcgi, and the WebEnv / QueryKey pair returned by that call is then used to fetch
     * the summaries with esummary.fcgi. The cross references are whatever each parsed summary reports
     * through {@code getCrossReferences()}.
     *
     * @param pubchemSubstanceIds substance ids to look up (not more than {@code MAX_RECORDS_PER_QUERY})
     * @return multimap that links substance ids to cross references; empty when the EPost output cannot be
     *         read, when it lacks a WebEnv or QueryKey, or when the ESummary XML cannot be parsed (each case
     *         is logged)
     * @throws javax.xml.ws.WebServiceException if more than the allowed number of entries were submitted
     * @throws RuntimeException if either request is answered with a status other than 200
     */
    public Multimap<String, CrossReference> getExternalIdentifiersForPubChemSubstances(
            Collection<String> pubchemSubstanceIds) throws WebServiceException {
        checkNumberOfSubmittedEntries(pubchemSubstanceIds);
        Multimap<String, CrossReference> res = HashMultimap.create();

        // Step 1: upload the ids; they are sent as one comma-separated "id" value.
        WebResource epostWebRes = client.resource(baseURL + "epost.fcgi");
        MultivaluedMap<String, String> queryParamsEPost = new MultivaluedMapImpl();
        queryParamsEPost.add("db", "pcsubstance");
        queryParamsEPost.add("id", StringUtils.join(pubchemSubstanceIds, ","));
        ClientResponse respEpost = submitPost(epostWebRes, queryParamsEPost);

        if (respEpost.getStatus() != 200) {
            throw new RuntimeException(
                    "Failed : HTTP error code : " + respEpost.getStatus() + " : " + respEpost.toString());
        }

        EPostResult epostRes;
        InputStream epostStream = respEpost.getEntityInputStream();
        try {
            epostRes = parseEpostOutput(epostStream);
        } catch (IOException e) {
            LOGGER.error("Could not parse Epost output ", e);
            return res;
        } finally {
            if (epostStream != null) {
                try {
                    epostStream.close();
                } catch (IOException closeFailure) {
                    LOGGER.warn("Could not close the EPost response stream", closeFailure);
                }
            }
        }

        // Without both tokens the follow-up request cannot refer to the uploaded ids, so stop here
        // instead of sending a request that is bound to fail.
        if (epostRes.getWebEnv() == null || epostRes.getQueryKey() == null) {
            LOGGER.error("Epost output did not contain both WebEnv and QueryKey; returning empty result");
            return res;
        }

        // Step 2: fetch the summaries of the uploaded ids.
        WebResource webRes = client.resource(baseURL + "esummary.fcgi");
        MultivaluedMap<String, String> queryParams = new MultivaluedMapImpl();
        queryParams.add("db", "pcsubstance");
        queryParams.add("query_key", epostRes.getQueryKey());
        queryParams.add("WebEnv", epostRes.getWebEnv());

        ClientResponse resp = submitPost(webRes, queryParams);

        LOGGER.info("Resp: " + resp.toString());

        if (resp.getStatus() != 200) {
            throw new RuntimeException("Failed : HTTP error code : " + resp.getStatus() + " : " + resp.toString());
        }

        PubChemSubstanceXMLResponseParser parser = new PubChemSubstanceXMLResponseParser();
        InputStream summaryStream = resp.getEntityInputStream();
        try {
            List<PubChemSubstanceESummaryResult> resultsParse = parser.parseESummaryResult(summaryStream);
            for (PubChemSubstanceESummaryResult summary : resultsParse) {
                res.putAll(summary.getId(), summary.getCrossReferences());
            }
        } catch (XMLStreamException ex) {
            LOGGER.warn("Could not parse output XML adequately... returning empty result", ex);
        } finally {
            if (summaryStream != null) {
                try {
                    summaryStream.close();
                } catch (IOException closeFailure) {
                    LOGGER.warn("Could not close the ESummary response stream", closeFailure);
                }
            }
        }
        return res;
    }

    /**
     * Posts an esummary.fcgi request for a collection of ids (callers should pass no more than 5,000) and
     * returns the raw response body. The ids are sent as one comma-separated {@code id} value. Parsing is
     * left to a specialized parser chosen by the caller.
     *
     * @param db  Entrez database to query
     * @param ids identifiers, in that database, to summarise
     * @return input stream with the results to be parsed
     * @throws RuntimeException if the service answers with a status other than 200
     */
    private InputStream submitESummaryQuery(EntrezDB db, Collection<String> ids) {
        final MultivaluedMap summaryParams = new MultivaluedMapImpl();
        summaryParams.add("db", db.toString());
        summaryParams.add("id", StringUtils.join(ids, ","));
        final ClientResponse response = submitPost(client.resource(baseURL + "esummary.fcgi"), summaryParams);

        LOGGER.info("Resp: " + response.toString());

        final int status = response.getStatus();
        if (status == 200) {
            return response.getEntityInputStream();
        }
        throw new RuntimeException("Failed : HTTP error code : " + status + " : " + response.toString());
    }

    /**
     * Looks up the names of the given PubChem compound ids (not more than 5000) and returns only the
     * compound-to-preferred-name view of the result.
     *
     * @param pubchemCompoundIds compound ids whose preferred names are wanted
     * @return map from compound id to preferred name
     */
    public Map<String, String> getPreferredNameForPubChemCompounds(List<String> pubchemCompoundIds) {
        return getNamesForPubChemCompounds(pubchemCompoundIds).getCompound2PreferredNameMap();
    }

    /**
     * Given a collection of PubChem compound ids (not more than 5000), fetches their ESummary records and
     * gathers, for every parsed record, the preferred name, the IUPAC name and the synonyms.
     *
     * @param pubchemCompoundIds compound ids whose names are wanted
     * @return the collected names; if the response XML cannot be parsed the failure is logged and the
     *         result holds nothing
     * @throws RuntimeException if the service answers with a status other than 200
     */
    public PubChemNamesResult getNamesForPubChemCompounds(Collection<String> pubchemCompoundIds) {
        InputStream response = submitESummaryQuery(EntrezDB.pccompound, pubchemCompoundIds);

        PubChemCompoundXMLResponseParser parser = new PubChemCompoundXMLResponseParser();
        PubChemNamesResult result = new PubChemNamesResult();
        try {
            List<PubChemCompoundESummaryResult> resultsParse = parser.parseESummaryResult(response);
            for (PubChemCompoundESummaryResult compoundSummary : resultsParse) {
                String compoundId = compoundSummary.getId();
                result.addPreferredName(compoundId, compoundSummary.getPreferredName());
                result.addIUPACName(compoundId, compoundSummary.getIUPACName());
                result.addSynonyms(compoundId, compoundSummary.getSynonyms());
            }
        } catch (XMLStreamException ex) {
            LOGGER.warn("Could not parse output XML adequately... returning empty result", ex);
        } finally {
            // Release the response stream once parsing is over, whether it succeeded or not.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException closeFailure) {
                    LOGGER.warn("Could not close the ESummary response stream", closeFailure);
                }
            }
        }
        return result;
    }

    //    public PubMedResult getPubMedEntries(Collection<String> pmids) {
    //        WebResource webRes = client.resource(baseURL + "efetch.fcgi");
    //        MultivaluedMap queryParams = new MultivaluedMapImpl();
    //        queryParams.add("db", EntrezDB.pubmed.toString());
    //        queryParams.add("id", StringUtils.join(pmids, ","));
    //        ClientResponse resp = submitPost(webRes, queryParams);
    //        
    //        LOGGER.info("Resp: "+resp.toString());
    //
    //        if (resp.getStatus() != 200) {
    //            throw new RuntimeException("Failed : HTTP error code : "
    //                    + resp.getStatus() +" : "+ resp.toString());
    //        }
    //        
    //        PubMedTxtResponseParser parser = new PubMedTxtResponseParser
    //    }

    /**
     * Fetches the ESummary records of the given MeSH ids and gathers, for every parsed record, the term
     * name, the parent and the children.
     *
     * @param meshIds MeSH identifiers to look up
     * @return the collected MeSH data; if the response XML cannot be parsed the failure is logged and the
     *         result holds nothing
     * @throws RuntimeException if the service answers with a status other than 200
     */
    public MeSHResult getMeSHObjects(Collection<String> meshIds) {
        // Same request, logging and status check as every other ESummary call in this class.
        InputStream response = submitESummaryQuery(EntrezDB.mesh, meshIds);

        MeSHXMLResponseParser parser = new MeSHXMLResponseParser();
        MeSHResult result = new MeSHResult();
        try {
            List<MeSHESummaryResult> resultsParse = parser.parseESummaryResult(response);
            for (MeSHESummaryResult eSummaryResult : resultsParse) {
                result.addMeSHName(eSummaryResult.getID(), eSummaryResult.getMeSHTermName());
                result.addParent(eSummaryResult.getID(), eSummaryResult.getParent());
                result.addChildren(eSummaryResult.getID(), eSummaryResult.getChildren());
            }
        } catch (XMLStreamException ex) {
            LOGGER.warn("Could not parse output XML adequately... returning empty result", ex);
        } finally {
            // Release the response stream once parsing is over, whether it succeeded or not.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException closeFailure) {
                    LOGGER.warn("Could not close the ESummary response stream", closeFailure);
                }
            }
        }
        return result;
    }

    /**
     * Mutable holder for the two values read from an EPost response: the WebEnv token and the QueryKey.
     * Both start out as {@code null} until set.
     */
    public class EPostResult {
        private String queryKey;
        private String webEnv;

        /**
         * @return the stored QueryKey, or {@code null} if none was set
         */
        public String getQueryKey() {
            return this.queryKey;
        }

        /**
         * @return the stored WebEnv token, or {@code null} if none was set
         */
        public String getWebEnv() {
            return this.webEnv;
        }

        /**
         * @param queryKey the QueryKey to store
         */
        public void setQueryKey(String queryKey) {
            this.queryKey = queryKey;
        }

        /**
         * @param webEnv the WebEnv token to store
         */
        public void setWebEnv(String webEnv) {
            this.webEnv = webEnv;
        }
    }
}