Source code

Java tutorial


Here is the source code for it.iit.genomics.cru.structures.bridges.uniprot.UniprotkbUtils.java


/*
 * Copyright 2015 Fondazione Istituto Italiano di Tecnologia.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.iit.genomics.cru.structures.bridges.uniprot;

import it.iit.genomics.cru.structures.bridges.commons.BridgesRemoteAccessException;
import it.iit.genomics.cru.structures.model.MoleculeEntry;
import it.iit.genomics.cru.structures.model.ChainMapping;
import it.iit.genomics.cru.structures.model.ModifiedResidue;
import it.iit.genomics.cru.structures.model.position.UniprotPosition;
import it.iit.genomics.cru.utils.maps.MapOfMap;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.io.FastaReaderHelper;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utilities to retrieve Uniprot AC and gene names from the Uniprotkb database.
 *
 * @author Arnaud Ceol
 */
public class UniprotkbUtils {

    private static final Logger logger = LoggerFactory.getLogger(UniprotkbUtils.class);

    private final String taxid;

    private static final String UNIPROT_SERVER = "";

    private static final String DBFETCH_SERVER = "";

    private static final String UNIPROT_TOOL = "uniprot";

    private static final String TAXONOMY_TOOL = "taxonomy";

     * Number of time we will wait and retry in case of failure to communicate
     * with Uniprot
    private static int allowedUniprotFailures = 10;

     * Maximum number of query (gene names, uniprot acc.) sent in a single
     * request
    private final static int maxQueries = 10;

    private final MapOfMap<String, MoleculeEntry> cache = new MapOfMap<>();

    private static final HashMap<String, UniprotkbUtils> instances = new HashMap<>();

    /**
     * Creates a helper bound to one organism. Private: the visible code only
     * instantiates it from {@code getInstance}.
     *
     * @param taxid organism identifier stored for use in the queries
     */
    private UniprotkbUtils(String taxid) {
        this.taxid = taxid;
    }

     * @param taxid
     * @return
    public static UniprotkbUtils getInstance(String taxid) {
        if (instances.get(taxid) == null) {
            instances.put(taxid, new UniprotkbUtils(taxid));
        return instances.get(taxid);

    /** Value sent in the {@code User-Agent} header of the HTTP requests. */
    private static final String USER_AGENT = "Mozilla/5.0";

    private Collection<MoleculeEntry> getUniprotEntriesXML(String location) throws BridgesRemoteAccessException {
        return getUniprotEntriesXML(location, true);

    private Collection<MoleculeEntry> getUniprotEntriesXML(String location, boolean waitAndRetryOnFailure)
            throws BridgesRemoteAccessException {
        // Requests `location` with "&format=xml" appended, parses the XML answer and
        // creates one MoleculeEntry per <entry> element. For every entry a second
        // request is then issued to read its FASTA sequences.
        //
        // location              query URL without the format parameter
        // waitAndRetryOnFailure when true (and allowedUniprotFailures > 0) a failure
        //                       leads to one more attempt with retrying disabled
        // returns               the parsed entries; an empty list when the response
        //                       entity reports a content length of 0
        // throws                BridgesRemoteAccessException when the request or the
        //                       parsing fails and no retry is attempted
        //
        // NOTE(review): this copy of the method is incomplete - closing braces and a
        // number of statements are missing. The NOTE(review) comments below mark the
        // places where something visibly does not add up; restore from upstream.

        String url = location + "&format=xml";

        ArrayList<MoleculeEntry> uniprotEntries = new ArrayList<>();
        try {
            HttpClient client = new DefaultHttpClient();
            client.getParams().setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, Boolean.TRUE);
            HttpGet request = new HttpGet(url);

            // add request header
            request.addHeader("User-Agent", USER_AGENT);

            HttpResponse response = client.execute(request);

            if (response.getEntity().getContentLength() == 0) {
                // No result
                return uniprotEntries;

            // NOTE(review): the factory is used with its default settings; consider
            // disabling DTDs / external entities before parsing remote content.
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document doc = dBuilder.parse(new InputSource(response.getEntity().getContent()));

            // optional, but recommended
            // read this -
            // NOTE(review): the statement (and link) these two comments referred to is
            // not present in this copy.

            // One <entry> element per record in the answer.
            NodeList entryList = doc.getElementsByTagName("entry");

            for (int i = 0; i < entryList.getLength(); i++) {

                Element entryElement = (Element) entryList.item(i);

                // NOTE(review): `dataset` is read but never used in the visible code.
                String dataset = entryElement.getAttribute("dataset");

                // The first <accession> element provides the identifier of the entry.
                String ac = entryElement.getElementsByTagName("accession").item(0).getFirstChild().getNodeValue();

                MoleculeEntry uniprotEntry = new MoleculeEntry(ac);


                // Taxid
                Element organism = (Element) entryElement.getElementsByTagName("organism").item(0);

                String organismCommonName = null;
                String organismScientificName = null;
                String organismOtherName = null;

                NodeList organismNames = organism.getElementsByTagName("name");

                for (int j = 0; j < organismNames.getLength(); j++) {

                    Element reference = (Element) organismNames.item(j);
                    // NOTE(review): no `break;` or `default:` is visible, so as written
                    // "scientific" falls through into "common"; the last assignment
                    // presumably belonged to a `default:` label - confirm.
                    switch (reference.getAttribute("type")) {
                    case "scientific":
                        organismScientificName = reference.getTextContent();
                    case "common":
                        organismCommonName = reference.getTextContent();
                        organismOtherName = reference.getTextContent();

                // Preference order: common name, then scientific name, then any other name.
                // NOTE(review): the bodies of the three branches are missing.
                if (null != organismCommonName) {
                } else if (null != organismScientificName) {
                } else if (null != organismOtherName) {

                NodeList organismReferences = organism.getElementsByTagName("dbReference");

                for (int j = 0; j < organismReferences.getLength(); j++) {
                    Element reference = (Element) organismReferences.item(j);
                    if (reference.hasAttribute("type") && "NCBI Taxonomy".equals(reference.getAttribute("type"))) {
                        // NOTE(review): `proteinTaxid` is not used by any visible statement.
                        String proteinTaxid = reference.getAttribute("id");

                // GENE
                NodeList geneNames = entryElement.getElementsByTagName("gene");

                for (int j = 0; j < geneNames.getLength(); j++) {
                    Element gene = (Element) geneNames.item(j);

                    NodeList nameList = gene.getElementsByTagName("name");

                    for (int k = 0; k < nameList.getLength(); k++) {
                        // NOTE(review): nothing visible consumes `name`.
                        Element name = (Element) nameList.item(k);

                // modified residues
                // Keyed by the text of the feature description before the first ';'.
                HashMap<String, ModifiedResidue> modifiedResidues = new HashMap<>();

                NodeList features = entryElement.getElementsByTagName("feature");
                for (int j = 0; j < features.getLength(); j++) {
                    Element feature = (Element) features.item(j);

                    // getElementsByTagName also returns nested elements; this test singles
                    // out features that are not direct children of the entry.
                    // NOTE(review): the body of the test is missing (presumably a skip).
                    if (false == entryElement.equals(feature.getParentNode())) {

                    // Only features of type "modified residue" are handled here.
                    if (feature.hasAttribute("type") && "modified residue".equals(feature.getAttribute("type"))) {

                        String description = feature.getAttribute("description").split(";")[0];

                        if (false == modifiedResidues.containsKey(description)) {
                            modifiedResidues.put(description, new ModifiedResidue(description));

                        NodeList locations = feature.getElementsByTagName("location");
                        for (int k = 0; k < locations.getLength(); k++) {
                            Element loc = (Element) locations.item(k);
                            NodeList positions = loc.getElementsByTagName("position");
                            for (int l = 0; l < positions.getLength(); l++) {
                                Element position = (Element) positions.item(l);
                                // NOTE(review): the beginning of the call receiving this
                                // UniprotPosition is missing.
                                        new UniprotPosition(Integer.parseInt(position.getAttribute("position"))));



                // Xrefs:
                NodeList dbReferences = entryElement.getElementsByTagName("dbReference");
                for (int j = 0; j < dbReferences.getLength(); j++) {
                    Element dbReference = (Element) dbReferences.item(j);

                    // Same direct-child test as for the features above.
                    // NOTE(review): the body of the test is missing.
                    if (false == entryElement.equals(dbReference.getParentNode())) {

                    NodeList molecules = dbReference.getElementsByTagName("molecule");

                    // ensembl
                    if (dbReference.hasAttribute("type") && "Ensembl".equals(dbReference.getAttribute("type"))) {

                        // transcript ID
                        String id = dbReference.getAttribute("id");

                        // Link the reference id to each <molecule> id listed under it.
                        for (int iMolecule = 0; iMolecule < molecules.getLength(); iMolecule++) {
                            Element molecule = (Element) molecules.item(iMolecule);
                            uniprotEntry.addXrefToVarSplice(id, molecule.getAttribute("id"));


                        NodeList properties = dbReference.getElementsByTagName("property");

                        for (int k = 0; k < properties.getLength(); k++) {
                            Element property = (Element) properties.item(k);

                            // NOTE(review): the handling of the "gene ID" property is missing.
                            if (property.hasAttribute("type") && "gene ID".equals(property.getAttribute("type"))) {

                    // refseq
                    if (dbReference.hasAttribute("type") && "RefSeq".equals(dbReference.getAttribute("type"))) {
                        NodeList properties = dbReference.getElementsByTagName("property");
                        for (int k = 0; k < properties.getLength(); k++) {
                            Element property = (Element) properties.item(k);
                            if (property.hasAttribute("type")
                                    && "nucleotide sequence ID".equals(property.getAttribute("type"))) {

                                String id = property.getAttribute("value");
                                if (molecules.getLength() > 0) {
                                    for (int iMolecule = 0; iMolecule < molecules.getLength(); iMolecule++) {
                                        Element molecule = (Element) molecules.item(iMolecule);

                                        // If refseq, add also without the version
                                        // NOTE(review): only the versioned id is added in
                                        // the visible code of this branch.
                                        uniprotEntry.addXrefToVarSplice(id, molecule.getAttribute("id"));

                                } else {
                                    // No <molecule> listed: link the id, with and without
                                    // its ".version" suffix, to the accession itself.
                                    uniprotEntry.addXrefToVarSplice(id, ac);
                                    uniprotEntry.addXrefToVarSplice(id.split("\\.")[0], ac);



                    /* PDB chains will be imported from the webservice */
                    // PDB
                    if (dbReference.hasAttribute("type") && "PDB".equals(dbReference.getAttribute("type"))) {
                        NodeList properties = dbReference.getElementsByTagName("property");
                        String method = null;
                        String chains = null;

                        for (int k = 0; k < properties.getLength(); k++) {
                            Element property = (Element) properties.item(k);
                            if (property.hasAttribute("type") && "method".equals(property.getAttribute("type"))) {
                                method = property.getAttribute("value");
                            } else if (property.hasAttribute("type")
                                    && "chains".equals(property.getAttribute("type"))) {
                                chains = property.getAttribute("value");

                        // NOTE(review): the bodies of the two tests below are missing;
                        // without them `chains.split` further down would fail on null.
                        if (method != null && "Model".equals(method)) {

                        if (chains == null) {

                        String pdb = dbReference.getAttribute("id");

                        uniprotEntry.addPDB(pdb, method);

                        // Each comma-separated element is parsed as "<names>=<start>-<end>",
                        // where <names> may hold several chain names separated by '/'.
                        for (String chainElement : chains.split(",")) {
                            try {
                                String chainNames = chainElement.split("=")[0];
                                int start = Integer.parseInt(chainElement.split("=")[1].trim().split("-")[0]);
                                int end = Integer
                                        .parseInt(chainElement.split("=")[1].trim().split("-")[1].replace(".", ""));
                                for (String chainName : chainNames.split("/")) {
                                    // NOTE(review): the end of this call is missing.
                                    uniprotEntry.addChain(pdb, new ChainMapping(pdb, chainName.trim(), start, end),
                            } catch (ArrayIndexOutOfBoundsException aiobe) {
                                // Elements that do not have the expected shape are skipped.
                                // IGBLogger.getInstance().warning(
                                // "Cannot parse chain: " + chainElement
                                // + ", skip");


                // Sequence
                NodeList sequenceElements = entryElement.getElementsByTagName("sequence");

                for (int j = 0; j < sequenceElements.getLength(); j++) {
                    Element sequenceElement = (Element) sequenceElements.item(j);

                    // NOTE(review): the body of this test is missing, and nothing visible
                    // stores `sequence` on the entry.
                    if (false == sequenceElement.getParentNode().equals(entryElement)) {
                    String sequence = sequenceElement.getFirstChild().getNodeValue().replaceAll("\n", "");

                // Diseases
                NodeList diseases = entryElement.getElementsByTagName("disease");

                for (int j = 0; j < diseases.getLength(); j++) {
                    Element disease = (Element) diseases.item(j);

                    NodeList nameList = disease.getElementsByTagName("name");

                    for (int k = 0; k < nameList.getLength(); k++) {
                        // NOTE(review): nothing visible consumes `name`.
                        Element name = (Element) nameList.item(k);

                // Get fasta for all varsplice
                // NOTE(review): the query starts with an empty string literal; the server
                // prefix appears to be missing.
                String fastaQuery = "" + uniprotEntry.getUniprotAc()
                        + ".fasta?include=yes";

                try {
                    //HttpClient fastaClient = new DefaultHttpClient();

                    // The client created for the XML request is reused.
                    client.getParams().setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, Boolean.TRUE);
                    HttpGet fastaRequest = new HttpGet(fastaQuery);

                    // add request header
                    // NOTE(review): the header is added to `request` (the XML request),
                    // not to `fastaRequest` - looks like a copy/paste slip.
                    request.addHeader("User-Agent", USER_AGENT);

                    HttpResponse fastaResponse = client.execute(fastaRequest);

                    // NOTE(review): the body of this test is missing.
                    if (fastaResponse.getEntity().getContentLength() == 0) {

                    // NOTE(review): `is` is not closed by any visible statement.
                    InputStream is = fastaResponse.getEntity().getContent();

                    try {
                        // NOTE(review): the FastaReaderHelper call is cut after the class name.
                        LinkedHashMap<String, ProteinSequence> fasta = FastaReaderHelper

                        // True until the first sequence of the answer has been handled.
                        boolean mainSequence = true;

                        for (ProteinSequence seq : fasta.values()) {
                            //                  "Add sequence: " + seq.getAccession().getID() + " : " + seq.getSequenceAsString());
                            uniprotEntry.addSequence(seq.getAccession().getID(), seq.getSequenceAsString());
                            if (mainSequence) {
                                mainSequence = false;
                    } catch (Exception e) {
                        // NOTE(review): the exception itself is not passed to the logger.
                        logger.error("Cannot retrieve fasta for : " + uniprotEntry.getUniprotAc());
                } catch (IOException | IllegalStateException ex) {
                    logger.error(null, ex);



        } catch (SAXParseException se) {
            // Nothing was return
            // A parse error is treated as "no result": nothing is thrown or logged here.
            // IGBLogger.getInstance()
            // .error("Uniprot returns empty result: " + url);
        } catch (IOException | ParserConfigurationException | IllegalStateException | SAXException | DOMException
                | NumberFormatException e) {
            if (waitAndRetryOnFailure && allowedUniprotFailures > 0) {
                try {
                    // NOTE(review): the InterruptedException handler below implies a
                    // blocking call (e.g. a sleep) before this retry; it is missing here.
                    return getUniprotEntriesXML(location, false);
                } catch (InterruptedException e1) {
                    logger.error("Fail to retrieve data from " + location);
                    throw new BridgesRemoteAccessException("Fail to retrieve data from Uniprot " + location);
            } else {
                logger.error("Problem with Uniprot: " + url);
                throw new BridgesRemoteAccessException("Fail to retrieve data from Uniprot " + location);

        // NOTE(review): the loop body is missing; presumably each entry is handed to
        // addToCache - confirm.
        for (MoleculeEntry entry : uniprotEntries) {

        return uniprotEntries;

    private void addToCache(MoleculeEntry protein) {

        // add to chache by uniprotAc, gene name, refseq ..
        // Only use the first one. Using synomyms may cause ambiguity.
        String geneName = protein.getGeneName();

        if (geneName != null) {
            cache.add(geneName.toUpperCase(), protein);

        for (String xref : protein.getRefseqs()) {
            // remove version
            cache.add(xref.toUpperCase().split("\\.")[0], protein);

        cache.add(protein.getUniprotAc(), protein);


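    /**
     * Retrieves the Uniprot entries associated with a collection of gene names.
     *
     * @param genes gene names to look up
     * @return the entries found, grouped by gene name
     * @throws BridgesRemoteAccessException if the data cannot be retrieved from Uniprot
     */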
    public MapOfMap<String, MoleculeEntry> getUniprotEntriesFromGenes(Collection<String> genes)
            throws BridgesRemoteAccessException {
        // Resolves gene names to entries: names found in the cache (upper-cased key)
        // are answered from it, the others are meant to be collected in `genes2get`
        // and queried in batches of `maxQueries`, plus a final query for the rest.
        //
        // NOTE(review): closing braces are missing in this copy, and no visible
        // statement ever adds to or clears `genes2get` - as written the batch stays
        // empty. The add/clear statements were presumably lost; restore from upstream.
        String tool = UNIPROT_TOOL;

        MapOfMap<String, MoleculeEntry> gene2uniprots = new MapOfMap<>(genes);

        // Names that still have to be requested remotely.
        HashSet<String> genes2get = new HashSet<>();

        try {
            for (String gene : genes) {
                if (cache.containsKey(gene.toUpperCase())) {
                    gene2uniprots.addAll(gene, cache.get(gene.toUpperCase()));
                } else {

                    // if size == limit, do query
                    if (genes2get.size() == maxQueries) {
                        // Query: keyword 181 AND organism "<taxid>" AND (gene:a OR gene:b ...).
                        // NOTE(review): keyword 181 is presumably "Complete proteome" - confirm.
                        String location = UNIPROT_SERVER + tool + "/?" + "query=keyword:181+AND+organism:"
                                + URLEncoder.encode("\"" + taxid + "\"", "UTF-8") + "+AND+(gene:"
                                + URLEncoder.encode(StringUtils.join(genes2get, " OR gene:"), "UTF-8") + ")";

                        Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

                        for (MoleculeEntry entry : uniprotEntries) {
                            String geneName = entry.getGeneName();

                            // Only use the first one. Using synomyms may cause
                            // ambiguity.
                            // NOTE(review): this match is case-sensitive whereas the cache
                            // lookup above is upper-cased.
                            if (geneName != null && gene2uniprots.containsKey(geneName)) {
                                gene2uniprots.add(geneName, entry);



            // Nothing left to request.
            if (genes2get.isEmpty()) {
                return gene2uniprots;

            // Final query for the names remaining in the batch (same URL shape as above).
            String location = UNIPROT_SERVER + tool + "/?" + "query=keyword:181+AND+organism:"
                    + URLEncoder.encode("\"" + taxid + "\"", "UTF-8") + "+AND+(gene:"
                    + URLEncoder.encode(StringUtils.join(genes2get, " OR gene:"), "UTF-8") + ")";

            Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

            for (MoleculeEntry entry : uniprotEntries) {
                String geneName = entry.getGeneName();

                // Only use the first one. Using synomyms may cause ambiguity.
                if (geneName != null && gene2uniprots.containsKey(geneName)) {
                    gene2uniprots.add(geneName, entry);

        } catch (UnsupportedEncodingException e) {
            // Logged only: the results gathered so far are still returned.
            logger.error("cannot get proteins for " + StringUtils.join(genes, ", "), e);

        return gene2uniprots;

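    /**
     * Retrieves the Uniprot entries associated with a collection of RefSeq identifiers.
     *
     * @param refSeqs RefSeq identifiers to look up
     * @return the entries found, grouped by RefSeq identifier
     * @throws BridgesRemoteAccessException if the data cannot be retrieved from Uniprot
     */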
    public MapOfMap<String, MoleculeEntry> getUniprotEntriesFromRefSeqs(Collection<String> refSeqs)
            throws BridgesRemoteAccessException {
        // Resolves RefSeq identifiers to entries: identifiers found in the cache
        // (upper-cased, text after the first '.' removed) are answered from it, the
        // others are meant to be batched in `refs2get` and queried remotely.
        //
        // NOTE(review): closing braces are missing in this copy, no visible statement
        // adds to or clears `refs2get`, and both URLEncoder.encode(...) calls that wrap
        // StringUtils.join have lost their second argument. Restore from upstream.
        String tool = UNIPROT_TOOL;

        MapOfMap<String, MoleculeEntry> refseq2uniprots = new MapOfMap<>(refSeqs);

        if (refSeqs.isEmpty()) {
            return refseq2uniprots;

        // Identifiers that still have to be requested remotely.
        HashSet<String> refs2get = new HashSet<>();

        try {
            for (String refseq : refSeqs) {
                if (cache.containsKey(refseq.toUpperCase().split("\\.")[0])) {
                    refseq2uniprots.addAll(refseq, cache.get(refseq.toUpperCase().split("\\.")[0]));
                } else {

                    // if size == limit, do query
                    if (refs2get.size() == maxQueries) {

                        // Query: keyword 181 AND organism "<taxid>" AND
                        // (database:(type:refseq a) OR database:(type:refseq b) ...).
                        String location = UNIPROT_SERVER + tool + "/?" + "query=keyword:181+AND+organism:"
                                + URLEncoder.encode("\"" + taxid + "\"", "UTF-8")
                                + "+AND+(database%3A(type%3Arefseq+"
                                + URLEncoder.encode(
                                        "" + StringUtils.join(refs2get, ") OR database:(type:refseq ") + "",
                                + "))";

                        Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

                        for (MoleculeEntry entry : uniprotEntries) {
                            for (String xref : entry.getRefseqs()) {
                                // Drop a trailing '.' left on the identifier.
                                if (xref.endsWith(".")) {
                                    xref = xref.substring(0, xref.length() - 1);
                                // Match the identifier as is first, then without its version.
                                // NOTE(review): the test uses xref.trim() but the untrimmed
                                // value is used as key.
                                if (refseq2uniprots.containsKey(xref.trim())) {
                                    refseq2uniprots.add(xref, entry);
                                } else if (refseq2uniprots.containsKey(xref.split("[.]")[0])) {
                                    refseq2uniprots.add(xref.split("[.]")[0], entry);


            // Nothing left to request.
            if (refs2get.isEmpty()) {
                return refseq2uniprots;

            // Final query for the identifiers remaining in the batch.
            String location = UNIPROT_SERVER + tool + "/?" + "query=keyword:181+AND+organism:"
                    + URLEncoder.encode("\"" + taxid + "\"", "UTF-8") + "+AND+(database%3A(type%3Arefseq+"
                    + URLEncoder.encode("" + StringUtils.join(refs2get, ") OR database:(type:refseq ") + "",
                    + "))";

            Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

            for (MoleculeEntry entry : uniprotEntries) {
                for (String xref : entry.getRefseqs()) {
                    if (xref.endsWith(".")) {
                        xref = xref.substring(0, xref.length() - 1);
                    if (refseq2uniprots.containsKey(xref.trim())) {
                        refseq2uniprots.add(xref, entry);
                    } else if (refseq2uniprots.containsKey(xref.split("[.]")[0])) {
                        refseq2uniprots.add(xref.split("[.]")[0], entry);

        } catch (UnsupportedEncodingException e) {
            // Logged only: the results gathered so far are still returned.
            logger.error("cannot get proteins for " + StringUtils.join(refSeqs, ", "), e);

        return refseq2uniprots;

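    /**
     * Retrieves the Uniprot entries associated with a collection of Ensembl gene identifiers.
     *
     * @param ensemblGeneIDs Ensembl gene identifiers to look up
     * @return the entries found, grouped by Ensembl gene identifier
     * @throws BridgesRemoteAccessException if the data cannot be retrieved from Uniprot
     */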
    public MapOfMap<String, MoleculeEntry> getUniprotEntriesFromEnsembl(Collection<String> ensemblGeneIDs)
            throws BridgesRemoteAccessException {
        // Resolves Ensembl gene identifiers to entries: identifiers found in the cache
        // (upper-cased, text after the first '.' removed) are answered from it, the
        // others are meant to be batched in `refs2get` and queried remotely.
        //
        // NOTE(review): closing braces are missing in this copy, no visible statement
        // adds to or clears `refs2get`, and both URLEncoder.encode(...) calls that wrap
        // StringUtils.join have lost their second argument. Restore from upstream.
        String tool = UNIPROT_TOOL;

        MapOfMap<String, MoleculeEntry> ensembl2uniprots = new MapOfMap<>(ensemblGeneIDs);

        if (ensemblGeneIDs.isEmpty()) {
            return ensembl2uniprots;

        // Identifiers that still have to be requested remotely.
        HashSet<String> refs2get = new HashSet<>();

        try {

            for (String ensemblGeneID : ensemblGeneIDs) {
                if (cache.containsKey(ensemblGeneID.toUpperCase().split("\\.")[0])) {
                    ensembl2uniprots.addAll(ensemblGeneID, cache.get(ensemblGeneID.toUpperCase().split("\\.")[0]));
                } else {

                    // if size == limit, do query
                    if (refs2get.size() == maxQueries) {
                        // Query: keyword 181 AND organism "<taxid>" AND
                        // (database:(type:ensembl a) OR database:(type:ensembl b) ...).
                        String location = UNIPROT_SERVER + tool + "/?" + "query=keyword:181+AND+organism:"
                                + URLEncoder.encode("\"" + taxid + "\"", "UTF-8")
                                + "+AND+(database%3A(type%3Aensembl+"
                                + URLEncoder.encode(
                                        "" + StringUtils.join(refs2get, ") OR database:(type:ensembl ") + "",
                                + "))";

                        Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

                        for (MoleculeEntry entry : uniprotEntries) {
                            for (String xref : entry.getEnsemblGenes()) {
                                // Drop a trailing '.' left on the identifier.
                                if (xref.endsWith(".")) {
                                    xref = xref.substring(0, xref.length() - 1);
                                // Match the identifier as is first, then without its version.
                                if (ensembl2uniprots.containsKey(xref.trim())) {
                                    ensembl2uniprots.add(xref, entry);
                                } else if (ensembl2uniprots.containsKey(xref.split("[.]")[0])) {
                                    ensembl2uniprots.add(xref.split("[.]")[0], entry);

            // Nothing left to request.
            if (refs2get.isEmpty()) {
                return ensembl2uniprots;

            // Final query for the identifiers remaining in the batch.
            String location = UNIPROT_SERVER + tool + "/?" + "query=keyword:181+AND+organism:"
                    + URLEncoder.encode("\"" + taxid + "\"", "UTF-8") + "+AND+(database%3A(type%3Aensembl+"
                    + URLEncoder.encode("" + StringUtils.join(refs2get, ") OR database:(type:ensembl ") + "",
                    + "))";

            Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

            for (MoleculeEntry entry : uniprotEntries) {
                for (String xref : entry.getEnsemblGenes()) {
                    if (xref.endsWith(".")) {
                        xref = xref.substring(0, xref.length() - 1);
                    if (ensembl2uniprots.containsKey(xref.trim())) {
                        ensembl2uniprots.add(xref, entry);
                    } else if (ensembl2uniprots.containsKey(xref.split("[.]")[0])) {
                        ensembl2uniprots.add(xref.split("[.]")[0], entry);

        } catch (UnsupportedEncodingException e) {
            // Logged only: the results gathered so far are still returned.
            logger.error("cannot get proteins for " + StringUtils.join(ensemblGeneIDs, ", "), e);

        return ensembl2uniprots;

     * @param xrefs
     * @return
     * @throws BridgesRemoteAccessException
    public HashMap<String, MoleculeEntry> getUniprotEntriesFromUniprotAccessions(Collection<String> xrefs)
            throws BridgesRemoteAccessException {
        return getUniprotEntriesFromUniprotAccessions(xrefs, true);

     * @param uniprotAc
     * @param filterTaxid
     * @return
     * @throws BridgesRemoteAccessException
    public MoleculeEntry getUniprotEntriesFromUniprotAccession(String uniprotAc, boolean filterTaxid)
            throws BridgesRemoteAccessException {

        Collection<String> acs = new ArrayList<>();

        return getUniprotEntriesFromUniprotAccessions(acs, filterTaxid).get(uniprotAc);

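    /**
     * Retrieves the entries for a collection of Uniprot accessions.
     *
     * @param xrefs cross references; those that are not Uniprot accessions are ignored
     * @param filterTaxid when true the query is restricted to the organism of this instance
     * @return the entries found, keyed by accession
     * @throws BridgesRemoteAccessException if the data cannot be retrieved from Uniprot
     */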
    public HashMap<String, MoleculeEntry> getUniprotEntriesFromUniprotAccessions(Collection<String> xrefs,
            boolean filterTaxid) throws BridgesRemoteAccessException {
        // Resolves accessions to entries: accessions found in the cache (upper-cased
        // key) are answered from it, the others are meant to be batched in `ref2get`
        // and queried remotely; the organism restriction is appended to the query only
        // when `filterTaxid` is true.
        //
        // NOTE(review): closing braces are missing in this copy, and no visible
        // statement adds to or clears `ref2get` - as written the batch stays empty.
        // Restore from upstream.
        String tool = UNIPROT_TOOL;

        // remove xrefs that are not uniprotAcs
        Collection<String> uniprotAcs = getUniprotAcs(xrefs);

        HashMap<String, MoleculeEntry> results = new HashMap<>();

        // Accessions that still have to be requested remotely.
        HashSet<String> ref2get = new HashSet<>();

        try {
            for (String ref : uniprotAcs) {
                if (cache.containsKey(ref.toUpperCase())) {
                    // The first entry cached under this key is used.
                    results.put(ref, cache.get(ref.toUpperCase()).iterator().next());
                } else {

                    // if size == limit, do query
                    if (ref2get.size() == maxQueries) {
                        // Query: (accession:a OR accession:b ...)
                        String location = UNIPROT_SERVER + tool + "/?" + "query=(accession:"
                                + URLEncoder.encode(StringUtils.join(ref2get, " OR accession:") + "", "UTF-8")
                                + ")";
                        if (filterTaxid) {
                            location += "+AND+keyword:181+AND+organism:"
                                    + URLEncoder.encode("\"" + taxid + "\"", "UTF-8");

                        Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

                        for (MoleculeEntry entry : uniprotEntries) {
                            results.put(entry.getUniprotAc(), entry);

            // Nothing left to request.
            if (ref2get.isEmpty()) {
                return results;

            // Final query for the accessions remaining in the batch.
            String location = UNIPROT_SERVER + tool + "/?" + "query=(accession:"
                    + URLEncoder.encode(StringUtils.join(ref2get, " OR accession:") + "", "UTF-8") + ")";
            if (filterTaxid) {
                location += "+AND+keyword:181+AND+organism:" + URLEncoder.encode("\"" + taxid + "\"", "UTF-8");
            Collection<MoleculeEntry> uniprotEntries = getUniprotEntriesXML(location);

            for (MoleculeEntry entry : uniprotEntries) {
                results.put(entry.getUniprotAc(), entry);

        } catch (UnsupportedEncodingException e) {
            // Logged only: the results gathered so far are still returned.
            logger.error("cannot get proteins for " + StringUtils.join(xrefs, ", "), e);

        return results;

    private static final String[][] DEFAULT_SPECIES = { { "Homo sapiens", "9606" }, { "Mus musculus", "10090" } };

     * @param name
     * @return
     * @throws BridgesRemoteAccessException
    public static ArrayList<String[]> getSpeciesFromName(String name) throws BridgesRemoteAccessException {

        ArrayList<String[]> results = new ArrayList<>();

        // Search first defaults taxid to avoid a remote connection to uniprot:
        for (String[] species : DEFAULT_SPECIES) {
            if (species[0].equals(name)) {
                return results;

        String tool = TAXONOMY_TOOL;

        try {
            String url = UNIPROT_SERVER + tool + "/?" + "query=complete:yes+AND+("
                    + URLEncoder.encode(name, "UTF-8") + ")&format=tab";

            HttpClient client = new DefaultHttpClient();
            HttpGet request = new HttpGet(url);

            HttpResponse response = client.execute(request);

            BufferedReader rd = new BufferedReader(new InputStreamReader(response.getEntity().getContent()));

            String line;

            while ((line = rd.readLine()) != null) {

                if (false == line.startsWith("Taxon")) {
                    String[] fields = line.split("\t");
                    if (fields.length >= 3) {
                        String[] specie = { fields[2], fields[0] };

        } catch (IOException | IllegalStateException e) {
            logger.error("cannot get species for " + name, e);
            throw new BridgesRemoteAccessException("Fail to retrieve species " + name + " from Uniprot ");

        return results;

     * Check syntax of a cross reference and verify it is a Uniprot Ac (without
     * isoform)
     * @param xref
     * @return
    public static boolean isUniprotAc(String xref) {
        return xref.matches("[A-Za-z0-9]{6}(\\-[0-9]+)?");

     * @param xref
     * @return
    public static boolean isChebiAc(String xref) {
        return xref.matches("CHEBI:[0-9]+");

    private static Collection<String> getUniprotAcs(Collection<String> xrefs) {
        HashSet<String> uniprotAcs = new HashSet<>();

        for (String xref : xrefs) {
            if (xref.matches(".*\\-[0-9]+")) {
                xref = xref.split("-")[0];

            if (isUniprotAc(xref)) {
        return uniprotAcs;

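    /**
     * Small manual check: retrieves entries for organism 9606 and prints their
     * PDB structures, chains and diseases.
     *
     * @param args not used
     * @throws Exception if the retrieval fails
     */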
    public static void main(String[] args) throws Exception {
        // Manual check: fetches entries through the instance for taxid "9606" and
        // prints, for each entry, its PDB ids, the chains of each PDB and its diseases.
        //
        // NOTE(review): closing braces are missing in this copy, nothing visible
        // fills `acs`, and the call chained on getInstance("9606") is cut before
        // the method name. Restore from upstream.

        ArrayList<String> acs = new ArrayList<>();


        HashMap<String, MoleculeEntry> prots = UniprotkbUtils.getInstance("9606")

        for (MoleculeEntry entry : prots.values()) {
            for (String pdb : entry.getPdbs()) {
                System.out.println("# " + pdb);
                for (ChainMapping chain : entry.getChains(pdb)) {
                    System.out.println("- " + pdb + ": " + chain.getChain());
            System.out.println("Diseases: " + StringUtils.join(entry.getDiseases(), ", "));

     * @param pdo
     * @return
    public Collection<String> getProteinsInStructures(String pdo) {
        throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
