ubic.gemma.core.loader.genome.gene.ncbi.NcbiGeneConverter.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.genome.gene.ncbi.NcbiGeneConverter.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006-2012 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.genome.gene.ncbi;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession;
import ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo;
import ubic.gemma.core.loader.util.converter.Converter;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.genome.Chromosome;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.PhysicalLocation;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.model.genome.gene.GeneAlias;
import ubic.gemma.model.genome.gene.GeneProduct;
import ubic.gemma.persistence.util.SequenceBinUtils;

import java.util.Collection;
import java.util.HashSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Convert NCBIGene2Accession objects into Gemma Gene objects with associated GeneProducts. Genes without products are
 * ignored.
 *
 * @author pavlidis
 * @author jrsantos
 */
@SuppressWarnings({ "WeakerAccess", "unused" }) // Possible external use
public class NcbiGeneConverter implements Converter<Object, Object> {

    private static final Log log = LogFactory.getLog(NcbiGeneConverter.class.getName());
    private static final ExternalDatabase genBank;
    private static final ExternalDatabase ensembl;

    static {
        genBank = ExternalDatabase.Factory.newInstance();
        NcbiGeneConverter.genBank.setName("Genbank");
        ensembl = ExternalDatabase.Factory.newInstance();
        NcbiGeneConverter.ensembl.setName("Ensembl");
    }

    AtomicBoolean producerDone = new AtomicBoolean(false);
    AtomicBoolean sourceDone = new AtomicBoolean(false);

    /**
     * @return the genBank
     */
    public static ExternalDatabase getGenbank() {
        return NcbiGeneConverter.genBank;
    }

    /**
     * @return the ensembl
     */
    public static ExternalDatabase getEnsembl() {
        return NcbiGeneConverter.ensembl;
    }

    @Override
    public Collection<Object> convert(Collection<?> sourceDomainObjects) {
        Collection<Object> results = new HashSet<>();
        for (Object object : sourceDomainObjects) {
            results.add(this.convert(object));
        }
        return results;
    }

    @Override
    @SuppressWarnings("unchecked")
    public Object convert(Object sourceDomainObject) {
        if (sourceDomainObject instanceof Collection) {
            return this.convert((Collection<Object>) sourceDomainObject);
        }
        assert sourceDomainObject instanceof NCBIGene2Accession;
        NCBIGene2Accession ncbiGene = (NCBIGene2Accession) sourceDomainObject;
        return this.convert(ncbiGene.getInfo());
    }

    public Gene convert(NCBIGeneInfo info) {
        Gene gene = Gene.Factory.newInstance();

        gene.setNcbiGeneId(Integer.parseInt(info.getGeneId()));
        gene.setName(info.getDefaultSymbol());
        gene.setOfficialSymbol(info.getDefaultSymbol());
        gene.setOfficialName(info.getDescription());
        gene.setEnsemblId(info.getEnsemblId());

        /*
         * NOTE we allow multiple discontinued or previous ids, separated by commas. This is a hack to account for cases
         * uncovered recently...can be minimized by running this regularly.
         */
        if (info.getHistory() != null) {
            assert info.getHistory().getCurrentId() == null
                    || info.getGeneId().equals(info.getHistory().getCurrentId());

            assert info.getHistory().getPreviousIds() != null;
            if (!info.getHistory().getPreviousIds().isEmpty()) {
                String previousIds = StringUtils.join(info.getHistory().getPreviousIds(), ",");
                gene.setPreviousNcbiId(previousIds);
            }

        } else if (StringUtils.isNotBlank(info.getDiscontinuedId())) {
            if (NcbiGeneConverter.log.isDebugEnabled())
                NcbiGeneConverter.log.debug("Gene matches a gene that was discontinued: " + gene
                        + " matches gene that had id " + info.getDiscontinuedId());
            gene.setPreviousNcbiId(info.getDiscontinuedId());
        }

        gene.setDescription("Imported from NCBI gene; Nomenclature status: " + info.getNomenclatureStatus());

        Taxon t = Taxon.Factory.newInstance();
        t.setNcbiId(info.getTaxId());
        t.setIsGenesUsable(false);
        gene.setTaxon(t);

        /*
         * We are going to stop maintaining this information
         */
        PhysicalLocation pl = PhysicalLocation.Factory.newInstance();
        Chromosome chrom = new Chromosome(info.getChromosome(), t);
        pl.setChromosome(chrom);

        gene.setPhysicalLocation(pl);

        Collection<GeneAlias> aliases = gene.getAliases();
        for (String alias : info.getSynonyms()) {
            GeneAlias newAlias = GeneAlias.Factory.newInstance();
            newAlias.setAlias(alias);
            aliases.add(newAlias);
        }

        for (String dbname : info.getDbXrefs().keySet()) {
            if (!dbname.equalsIgnoreCase("Ensembl"))
                continue;
            String identifier = info.getDbXrefs().get(dbname);
            DatabaseEntry crossref = DatabaseEntry.Factory.newInstance();
            crossref.setAccession(identifier);
            crossref.setExternalDatabase(NcbiGeneConverter.getEnsembl());
            gene.getAccessions().add(crossref);
        }

        return gene;

    }

    public Collection<GeneProduct> convert(NCBIGene2Accession acc, Gene gene) {
        Collection<GeneProduct> geneProducts = new HashSet<>();
        // initialize up to two Gene Products

        // RNA section
        if (acc.getRnaNucleotideAccession() != null) {
            GeneProduct rna = GeneProduct.Factory.newInstance();

            // set available fields
            rna.setNcbiGi(acc.getRnaNucleotideGI());
            rna.setGene(gene);
            rna.setName(acc.getRnaNucleotideAccession());

            String description = "Imported from NCBI Gene";

            if (acc.getStatus() != null) {
                description = description + " (Refseq status: " + acc.getStatus() + ").";
            }

            if (acc.getRnaNucleotideAccession() != null) {
                DatabaseEntry accession = DatabaseEntry.Factory.newInstance();
                accession.setAccession(acc.getRnaNucleotideAccession());
                accession.setAccessionVersion(acc.getRnaNucleotideAccessionVersion());
                accession.setExternalDatabase(NcbiGeneConverter.genBank);
                if (rna.getAccessions() == null) {
                    rna.setAccessions(new HashSet<DatabaseEntry>());
                }
                rna.getAccessions().add(accession);
            }

            /*
             * Fill in physical location details.
             */
            if (acc.getGenomicNucleotideAccession() != null && gene.getPhysicalLocation() != null) {
                this.getChromosomeDetails(acc, gene);
                PhysicalLocation pl = this.getPhysicalLocation(acc, gene);
                rna.setPhysicalLocation(pl);
            }

            rna.setDescription(description);
            geneProducts.add(rna);
        }

        return geneProducts;
    }

    public Gene convert(NcbiGeneData data) {
        // get gene info and fill in gene
        NCBIGeneInfo geneInfo = data.getGeneInfo();
        Gene gene = this.convert(geneInfo);

        // grab all accessions and fill in GeneProduct/DatabaseEntry
        // and associate with Gene
        Collection<NCBIGene2Accession> gene2accession = data.getAccessions();
        Collection<GeneProduct> geneProducts = new HashSet<>();

        for (NCBIGene2Accession acc : gene2accession) {
            geneProducts.addAll(this.convert(acc, gene));
        }
        gene.setProducts(geneProducts);

        return gene;
    }

    /*
     * Threaded conversion of domain objects to Gemma objects.
     */
    public void convert(final BlockingQueue<NcbiGeneData> geneInfoQueue, final BlockingQueue<Gene> geneQueue) {
        // start up thread to convert a member of geneInfoQueue to a gene/geneproduct/databaseentry
        // then push the gene onto the geneQueue for loading

        Thread convertThread = new Thread(new Runnable() {
            @Override
            @SuppressWarnings("synthetic-access")
            public void run() {
                while (!(sourceDone.get() && geneInfoQueue.isEmpty())) {
                    try {
                        NcbiGeneData data = geneInfoQueue.poll();
                        if (data == null) {
                            continue;
                        }
                        Gene converted = NcbiGeneConverter.this.convert(data);

                        if (converted.getProducts().isEmpty()) {
                            log.info("Gene with no products skipped: " + converted);
                            continue;
                        }

                        geneQueue.put(converted);

                    } catch (InterruptedException e) {
                        NcbiGeneConverter.log.warn("Interrupted");
                        break;
                    } catch (Exception e) {
                        NcbiGeneConverter.log.error(e, e);
                        break;
                    }
                }
                producerDone.set(true);
            }
        }, "Converter");

        convertThread.start();
    }

    public boolean isProducerDone() {
        return this.producerDone.get();
    }

    public void setProducerDoneFlag(AtomicBoolean flag) {
        this.producerDone = flag;
    }

    public void setSourceDoneFlag(AtomicBoolean flag) {
        this.sourceDone = flag;
    }

    private PhysicalLocation getPhysicalLocation(NCBIGene2Accession acc, Gene gene) {
        PhysicalLocation pl = PhysicalLocation.Factory.newInstance();
        pl.setChromosome(gene.getPhysicalLocation().getChromosome());
        if (acc.getOrientation() != null) {
            pl.setStrand(acc.getOrientation());
        }
        if (acc.getStartPosition() != null) {
            pl.setNucleotide(acc.getStartPosition());
            pl.setNucleotideLength((int) Math.abs(acc.getEndPosition() - acc.getStartPosition()));
            pl.setBin(SequenceBinUtils.binFromRange(acc.getStartPosition().intValue(),
                    acc.getEndPosition().intValue()));
        }
        return pl;
    }

    private void getChromosomeDetails(NCBIGene2Accession acc, Gene gene) {
        Chromosome chrom = gene.getPhysicalLocation().getChromosome();
        BioSequence chromSeq = BioSequence.Factory.newInstance();
        chromSeq.setName(acc.getGenomicNucleotideAccession());
        chromSeq.setType(SequenceType.WHOLE_CHROMOSOME);
        chromSeq.setTaxon(gene.getTaxon());
        DatabaseEntry dbe = DatabaseEntry.Factory.newInstance();
        dbe.setExternalDatabase(NcbiGeneConverter.genBank);
        dbe.setAccession(acc.getGenomicNucleotideAccession());
        dbe.setAccessionVersion(acc.getGenomicNucleotideAccessionVersion());
        chromSeq.setSequenceDatabaseEntry(dbe);
        try {
            FieldUtils.writeField(chrom, "sequence", chromSeq, true);
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }

    }

}