ubic.gemma.loader.protein.StringProteinProteinInteractionConverter.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.protein.StringProteinProteinInteractionConverter.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2010 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.protein;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.gemma.loader.protein.biomart.model.Ensembl2NcbiValueObject;
import ubic.gemma.loader.protein.string.model.StringProteinProteinInteraction;
import ubic.gemma.loader.util.converter.Converter;
import ubic.gemma.model.association.Gene2GeneProteinAssociation;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.util.ConfigUtils;

/**
 * Converts value objects produced by parsing STRING files ({@link StringProteinProteinInteraction}) into Gemma
 * {@link Gene2GeneProteinAssociation}s. Ensembl protein ids are translated to NCBI/Entrez gene ids through a map of
 * {@link Ensembl2NcbiValueObject}s keyed by Ensembl protein id.
 * 
 * @author ldonnison
 * @version $Id: StringProteinProteinInteractionConverter.java,v 1.6 2012/06/05 18:33:19 paul Exp $
 */
public class StringProteinProteinInteractionConverter implements Converter<Object, Object> {

    private static final Log log = LogFactory.getLog(StringProteinProteinInteractionConverter.class);

    /** The joining string between two protein ids to create the url link in string for the interaction */
    private static final String PROTEIN2PROTEINLINK = "%0D";

    /**
     * Matches a numeric taxon id and its trailing dot, but only at the very start of an identifier (e.g. the
     * "9606." in "9606.ENSP00000000233"). The anchor keeps digits and dots that occur later in the id intact.
     */
    private static final String LEADING_TAXON_ID_REGEX = "^[0-9]+\\.";

    /** String url, read from the "protein.string.linksurl" configuration property. */
    private final String stringUrl;

    /** Version of string being used, read from the "protein.string.version" configuration property. */
    private final String stringVersion;

    /** The key is the ensembl protein id. */
    private Map<String, Ensembl2NcbiValueObject> ensembl2ncbi = null;

    /** Reference to external database as held in gemma system */
    private ExternalDatabase stringExternalDatabase;

    /** Set to true by the thread started in the queue based convert method once that thread has finished. */
    AtomicBoolean producerDone = new AtomicBoolean(false);

    /**
     * Creates a converter and reads the STRING version and links url from the configuration.
     * 
     * @param ensembl2ncbi Map of ensembl peptide ids to entrez/ncbi id genes.
     * @throws RuntimeException wrapping a {@link ConfigurationException} if the configured STRING url or version is
     *         null or empty
     */
    public StringProteinProteinInteractionConverter(Map<String, Ensembl2NcbiValueObject> ensembl2ncbi) {
        this.ensembl2ncbi = ensembl2ncbi;

        this.stringVersion = ConfigUtils.getString("protein.string.version");
        this.stringUrl = ConfigUtils.getString("protein.string.linksurl");
        if (this.stringUrl == null || this.stringUrl.length() == 0) {
            throw new RuntimeException(new ConfigurationException("stringUrl was null or empty"));
        }
        if (this.stringVersion == null || this.stringVersion.length() == 0) {
            throw new RuntimeException(new ConfigurationException("stringVersion was null or empty"));
        }
    }

    /**
     * Converts every element of the given collection with {@link #convert(Object)} and logs the elapsed time in
     * whole seconds. The results are gathered in a {@link HashSet}, so results that compare equal are stored only
     * once.
     * 
     * @param sourceDomainObjects the objects to convert
     * @return the converted objects, one entry per distinct conversion result
     */
    @Override
    public Collection<Object> convert(Collection<? extends Object> sourceDomainObjects) {
        long startTime = System.currentTimeMillis();
        Collection<Object> results = new HashSet<Object>();
        for (Object object : sourceDomainObjects) {
            results.add(this.convert(object));
        }
        long elapsedSeconds = (System.currentTimeMillis() - startTime) / 1000;
        log.info("Time taken for conversion call is  " + elapsedSeconds);
        return results;
    }

    /**
     * Standard converter code: dispatches on the runtime type of the argument. Collections are converted element by
     * element, a {@link StringProteinProteinInteraction} is converted into its associations.
     * 
     * @param sourceDomainObject a collection or a StringProteinProteinInteraction
     * @return the conversion result for the given object
     * @throws RuntimeException if the object is of any other type (including null)
     * @see ubic.gemma.loader.util.converter.Converter#convert(java.lang.Object)
     */
    @Override
    public Object convert(Object sourceDomainObject) {
        if (sourceDomainObject instanceof Collection) {
            return this.convert((Collection<?>) sourceDomainObject);
        }
        if (sourceDomainObject instanceof StringProteinProteinInteraction) {
            return convert((StringProteinProteinInteraction) sourceDomainObject);
        }
        throw new RuntimeException("Incorrect domain object passed");
    }

    /**
     * Given a StringProteinProteinInteraction value object create gemma Gene2GeneProteinAssociations. One
     * StringProteinProteinInteraction can potentially create many Gene2GeneProteinAssociation objects: if the call to
     * getNcbiGene returns more than one gene then each gene returned is turned into an interaction. This is done for
     * both protein 1 and protein 2, so one association is created for every pairing of a protein 1 gene with a
     * protein 2 gene.
     * 
     * @param sourceDomainObject the domain object to process
     * @return collection of Gene2GeneProteinAssociation representing this interaction; empty if either protein has no
     *         gene mapping
     */
    public Collection<Gene2GeneProteinAssociation> convert(StringProteinProteinInteraction sourceDomainObject) {

        Collection<Gene2GeneProteinAssociation> gene2GeneProteinAssociations = new ArrayList<Gene2GeneProteinAssociation>();

        // take each ensembl id and see how many ncbi ids it maps to
        Collection<Gene> genesForProteinOne = this.getNcbiGene(sourceDomainObject.getProtein1());
        Collection<Gene> genesForProteinTwo = this.getNcbiGene(sourceDomainObject.getProtein2());

        // empty if no mapping found; only the first missing mapping is reported
        if (genesForProteinOne.isEmpty()) {
            log.warn("No ncbi gene mapping for protein 1 " + sourceDomainObject.getProtein1());
            return gene2GeneProteinAssociations;
        }
        if (genesForProteinTwo.isEmpty()) {
            log.warn("No ncbi gene mapping for protein 2 " + sourceDomainObject.getProtein2());
            return gene2GeneProteinAssociations;
        }

        // create the one to many mapping from ensembl to ncbi/entrez
        for (Gene geneProtein1 : genesForProteinOne) {
            for (Gene geneProtein2 : genesForProteinTwo) {
                Gene2GeneProteinAssociation association = Gene2GeneProteinAssociation.Factory.newInstance();
                // a fresh database entry is created for every association
                association.setDatabaseEntry(this.getDataBaseEntry(sourceDomainObject));
                association.setConfidenceScore(sourceDomainObject.getCombined_score());
                association.setEvidenceVector(sourceDomainObject.getEvidenceVector());
                association.setFirstGene(geneProtein1);
                association.setSecondGene(geneProtein2);
                gene2GeneProteinAssociations.add(association);
            }
        }
        return gene2GeneProteinAssociations;
    }

    /**
     * One ensemblProteinID can map to multiple ncbi genes. This method takes the ensembl protein id and creates a
     * collection of entrez ncbi genes. It first removes the taxon id from the beginning of the peptide id as given by
     * string, if one is present.
     * 
     * @param ensemblProteinId The ensembl protein id in this interaction
     * @return Collection of genes as represented in ncbi entrez gene; empty if the id is not in the map or has no
     *         entrez genes
     * @throws NumberFormatException if a non-empty entrez gene id is not an integer
     */
    public Collection<Gene> getNcbiGene(String ensemblProteinId) {
        Collection<Gene> genes = new ArrayList<Gene>();

        // in case species id is still on there from STRING like 12334.ENSD....
        // Anchored at the start so that an id without a taxon prefix is left untouched.
        String eid = ensemblProteinId.replaceFirst(LEADING_TAXON_ID_REGEX, "");

        Ensembl2NcbiValueObject e2n = ensembl2ncbi.get(eid);
        if (e2n == null || e2n.getEntrezgenes().isEmpty()) {
            return genes;
        }

        String ensemblGeneId = e2n.getEnsemblGeneId();

        for (String entrezGeneid : e2n.getEntrezgenes()) {
            if (entrezGeneid.isEmpty()) {
                continue;
            }
            Gene gene = Gene.Factory.newInstance();
            gene.setNcbiGeneId(Integer.parseInt(entrezGeneid));
            gene.setEnsemblId(ensemblGeneId);
            genes.add(gene);
            if (log.isDebugEnabled()) {
                log.debug("Entry found for entrezGeneid " + entrezGeneid);
            }
        }

        return genes;
    }

    /**
     * Threaded conversion of domain objects to Gemma objects. Starts a thread named "Converter" that converts each
     * non-null interaction and puts every resulting association on the queue. The producer-done flag is set when the
     * thread finishes, whether it completed normally, was interrupted, or failed with an exception, so that a
     * consumer polling {@link #isProducerDone()} is not left waiting.
     * 
     * @param gene2GeneProteinAssociationQueue queue that receives the converted associations
     * @param stringProteinProteinInteractions the interactions to convert; null elements are skipped
     */
    public void convert(final BlockingQueue<Gene2GeneProteinAssociation> gene2GeneProteinAssociationQueue,
            final Collection<StringProteinProteinInteraction> stringProteinProteinInteractions) {
        Thread convertThread = new Thread(new Runnable() {
            @Override
            @SuppressWarnings("synthetic-access")
            public void run() {
                try {
                    for (StringProteinProteinInteraction stringProteinProteinInteraction : stringProteinProteinInteractions) {
                        if (stringProteinProteinInteraction == null) {
                            continue;
                        }
                        Collection<Gene2GeneProteinAssociation> dataColl = convert(stringProteinProteinInteraction);
                        // this returns a collection so split out and put on queue
                        for (Gene2GeneProteinAssociation gene2GeneProteinAssociation : dataColl) {
                            gene2GeneProteinAssociationQueue.put(gene2GeneProteinAssociation);
                        }
                    }
                } catch (InterruptedException e) {
                    log.info("Interrupted.");
                    // restore the interrupt status that was cleared when the exception was thrown
                    Thread.currentThread().interrupt();
                } finally {
                    // always signal completion, even if conversion failed with an unchecked exception
                    producerDone.set(true);
                }
            }

        }, "Converter");

        convertThread.start();
    }

    /**
     * Create a database entry which represents the external record as held in string
     * 
     * @param stringProteinProteinInteractionId object which contains the two protein ids
     * @return DatabaseEntry representing the record as held in string
     */
    public DatabaseEntry getDataBaseEntry(StringProteinProteinInteraction stringProteinProteinInteractionId) {
        String proteinProteinInteraction = this.getProteinProteinInteractionId(stringProteinProteinInteractionId);
        return DatabaseEntry.Factory.newInstance(proteinProteinInteraction, stringVersion, stringUrl,
                stringExternalDatabase);
    }

    /**
     * This is a made up value for the accessionId: the protein 1 id and the protein 2 id joined by "%0D". This is so
     * that it can be sent as a whole to string to retrieve the record in string.
     * 
     * @param stringProteinProteinInteraction interaction holding the protein 1 and protein 2 ids
     * @return Combined protein 1 and protein 2 ids representing an identifier for this protein interaction
     */
    public String getProteinProteinInteractionId(StringProteinProteinInteraction stringProteinProteinInteraction) {
        return stringProteinProteinInteraction.getProtein1().concat(PROTEIN2PROTEINLINK)
                .concat(stringProteinProteinInteraction.getProtein2());
    }

    /**
     * Set the map of ids
     * 
     * @param bioMartStringEntreGeneMapping map keyed by ensembl protein id that replaces the current mapping
     */
    public void setEnsemblEntrzMap(Map<String, Ensembl2NcbiValueObject> bioMartStringEntreGeneMapping) {
        this.ensembl2ncbi = bioMartStringEntreGeneMapping;
    }

    /**
     * @return the stringExternalDatabase
     */
    public ExternalDatabase getStringExternalDatabase() {
        return stringExternalDatabase;
    }

    /**
     * @param externalDatabase the external database to attach to the database entries created by this converter
     */
    public void setStringExternalDatabase(ExternalDatabase externalDatabase) {
        this.stringExternalDatabase = externalDatabase;
    }

    /**
     * @return true once the conversion thread has finished
     */
    public boolean isProducerDone() {
        return this.producerDone.get();
    }

    /**
     * Replaces the flag that the conversion thread sets when it finishes.
     * 
     * @param flag the flag to use from now on
     */
    public void setProducerDoneFlag(AtomicBoolean flag) {
        this.producerDone = flag;
    }

}