ubic.gemma.loader.genome.gene.ncbi.NcbiGeneLoader.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.genome.gene.ncbi.NcbiGeneLoader.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.genome.gene.ncbi;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.security.core.context.SecurityContext;
import org.springframework.security.core.context.SecurityContextHolder;

import ubic.gemma.genome.taxon.service.TaxonService;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.persistence.Persister;

/**
 * Load or update information about genes from the NCBI Gene database.
 * 
 * @author jsantos, paul
 * @version $Id: NcbiGeneLoader.java,v 1.32 2013/02/28 00:20:50 paul Exp $
 */
public class NcbiGeneLoader {
    private static Log log = LogFactory.getLog(NcbiGeneConverter.class.getName());
    private static final int QUEUE_SIZE = 1000;

    private AtomicBoolean generatorDone;
    private AtomicBoolean converterDone;
    private AtomicBoolean loaderDone;
    private Persister persisterHelper;
    private int loadedGeneCount = 0;
    private TaxonService taxonService;

    // whether to fetch files from ncbi or use existing ones
    private boolean doDownload = true;
    private Integer startingNcbiId = null;

    public NcbiGeneLoader() {
        generatorDone = new AtomicBoolean(false);
        converterDone = new AtomicBoolean(false);
        loaderDone = new AtomicBoolean(false);
    }

    public NcbiGeneLoader(Persister persisterHelper) {
        this();
        this.setPersisterHelper(persisterHelper);
    }

    /**
     * @return the loadedGeneCount
     */
    public int getLoadedGeneCount() {
        return loadedGeneCount;
    }

    public boolean isLoaderDone() {
        return loaderDone.get();
    }

    /**
     * download the gene_info and gene2accession files, then call load
     * 
     * @return
     * @throws IOException
     */
    public void load(boolean filterTaxa) {
        String geneInfoFile = "";
        String gene2AccFile = "";
        String geneHistoryFile = "";
        String geneEnsemblFile = "";
        load(geneInfoFile, gene2AccFile, geneHistoryFile, geneEnsemblFile, filterTaxa);
    }

    /**
     * @param geneInfoFile the gene_info file
     * @param gene2AccFile the gene2accession file
     * @param historyfile
     * @param ensembl mapping file
     * @param filterTaxa should we filter out taxa we're not supporting
     */
    public void load(String geneInfoFile, String gene2AccFile, String geneHistoryFile, String geneEnsemblFile,
            boolean filterTaxa) {

        Collection<Taxon> supportedTaxa = null;
        if (filterTaxa) {
            supportedTaxa = this.taxonService.loadAll();
        }
        load(geneInfoFile, gene2AccFile, geneHistoryFile, geneEnsemblFile, supportedTaxa);

    }

    /**
     * @param geneInfoFile
     * @param gene2AccFile
     * @param geneHistoryFile
     * @param geneEnsemblFile
     * @param t the specific taxon to process
     */
    public void load(String geneInfoFile, String gene2AccFile, String geneHistoryFile, String geneEnsemblFile,
            Taxon t) {

        Collection<Taxon> taxaToUse = new HashSet<Taxon>();
        taxaToUse.add(t);

        this.load(geneInfoFile, gene2AccFile, geneHistoryFile, geneEnsemblFile, taxaToUse);

    }

    public void load(Taxon t) {
        String geneInfoFile = "";
        String gene2AccFile = "";
        String geneHistoryFile = "";
        String geneEnsemblFile = "";
        load(geneInfoFile, gene2AccFile, geneHistoryFile, geneEnsemblFile, t);
    }

    /**
     * @param persisterHelper the persisterHelper to set
     */
    public void setPersisterHelper(Persister persisterHelper) {
        this.persisterHelper = persisterHelper;
    }

    public void setTaxonService(TaxonService bean) {
        this.taxonService = bean;

    }

    /**
     * Method to update taxon to indicate that genes have been loaded for that taxon are are usable. If there is a
     * parent taxon for this species and it has genes loaded against it then use that parent's taxons genes rather than
     * the species found in NCBI. Set the flag genesUSable to false for that child taxon that was found in ncbi.
     * 
     * @param taxaGenesLoaded List of taxa that have had genes loaded into GEMMA from NCBI.
     */
    public void updateTaxaWithGenesUsable(Collection<Taxon> taxaGenesLoaded) {

        if (taxaGenesLoaded != null && !taxaGenesLoaded.isEmpty()) {
            for (Taxon taxon : taxaGenesLoaded) {

                if (taxon == null) {
                    log.warn("null taxon");
                    continue;
                }

                Boolean genesUsableParent = false;
                Taxon parentTaxon = taxon.getParentTaxon();
                if (parentTaxon != null && parentTaxon.getIsGenesUsable()) {
                    genesUsableParent = true;
                    taxon.setIsGenesUsable(false);
                    taxonService.update(taxon);
                    log.debug("Parent taxon found: " + parentTaxon + ": Not using genes from taxon: " + taxon);
                }
                if (!taxon.getIsGenesUsable() && !genesUsableParent) {
                    taxon.setIsGenesUsable(true);
                    taxonService.update(taxon);
                    log.debug("Updating taxon genes usable to true for taxon " + taxon);
                }
            }
        } else {
            throw new IllegalArgumentException("No taxa were processed for this NCBI load");
        }
    }

    /**
     * @param geneQueue
     */
    void doLoad(final BlockingQueue<Gene> geneQueue) {
        StopWatch timer = new StopWatch();
        timer.start();
        int skipped = 0;
        while (!(converterDone.get() && geneQueue.isEmpty())) {
            Gene gene = null;
            try {
                // the converted genes.
                gene = geneQueue.poll();
                if (gene == null) {
                    continue;
                }

                if (gene.getProducts().isEmpty()) {
                    // // log.warn( gene + " has no products, skipping" ); // common!!!
                    // skipped++;
                }

                persisterHelper.persistOrUpdate(gene);

                if (++loadedGeneCount % 1000 == 0 || timer.getTime() > 30 * 1000) {
                    log.info("Processed " + loadedGeneCount + " genes. Queue has " + geneQueue.size()
                            + " items; last gene: " + gene);
                    if (skipped > 0) {
                        log.info(skipped + " skipped because they had no gene products.");
                    }
                    timer.reset();
                    timer.start();
                }

            } catch (Exception e) {
                log.error("Error while loading gene: " + gene + ": " + e.getMessage(), e);
                loaderDone.set(true);
                throw new RuntimeException(e);
            }
        }
        log.info("Loaded " + loadedGeneCount + " genes. ");
        loaderDone.set(true);
    }

    /**
     * @param geneQueue a blocking queue of genes to be loaded into the database loads genes into the database
     * @param geneQueue
     */
    private void load(final BlockingQueue<Gene> geneQueue) {
        final SecurityContext context = SecurityContextHolder.getContext();
        assert context != null;

        Thread loadThread = new Thread(new Runnable() {
            @Override
            public void run() {
                SecurityContextHolder.setContext(context);
                doLoad(geneQueue);
            }

        }, "Loading");
        loadThread.start();

        while (!generatorDone.get() || !converterDone.get() || !loaderDone.get()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * @param geneInfoFile
     * @param gene2AccFile
     * @param geneHistoryFile
     * @param geneEnsemblFile
     * @param supportedTaxa can be null if we just want everything
     */
    private void load(String geneInfoFile, String gene2AccFile, String geneHistoryFile, String geneEnsemblFile,
            Collection<Taxon> supportedTaxa) {
        /*
         * In case this is reused.
         */
        this.generatorDone.set(false);
        this.converterDone.set(false);
        this.loaderDone.set(false);

        NcbiGeneDomainObjectGenerator sdog = new NcbiGeneDomainObjectGenerator(supportedTaxa);
        sdog.setDoDownload(doDownload);
        sdog.setProducerDoneFlag(generatorDone);
        sdog.setStartingNcbiId(startingNcbiId);

        NcbiGeneConverter converter = new NcbiGeneConverter();
        converter.setSourceDoneFlag(generatorDone);
        converter.setProducerDoneFlag(converterDone);

        // create queue for GeneInfo objects
        final BlockingQueue<NcbiGeneData> geneInfoQueue = new ArrayBlockingQueue<NcbiGeneData>(QUEUE_SIZE);
        final BlockingQueue<Gene> geneQueue = new ArrayBlockingQueue<Gene>(QUEUE_SIZE);

        // Threaded producer - loading files into queue as GeneInfo objects
        if (StringUtils.isEmpty(geneInfoFile) || StringUtils.isEmpty(geneInfoFile)) {
            sdog.generate(geneInfoQueue);
        } else {
            sdog.generateLocal(geneInfoFile, gene2AccFile, geneHistoryFile, geneEnsemblFile, geneInfoQueue);
        }

        // Threaded consumer/producer - consumes GeneInfo objects and generates
        // Gene/GeneProduct/DatabaseEntry entries
        converter.convert(geneInfoQueue, geneQueue);

        // Threaded consumer. Consumes Gene objects and persists them into
        // the database
        this.load(geneQueue);

        // update taxon table to indicate that now there are genes loaded for that taxa.
        // all or nothing so that if fails for some taxa then no taxa will be updated.
        this.updateTaxaWithGenesUsable(sdog.getSupportedTaxaWithNCBIGenes());
    }

    /**
     * Set to true to avoid downloading the files, if copies already exist (not recommended if you want an update!)
     * 
     * @param skipDownload
     */
    public void setSkipDownload(boolean skipDownload) {
        this.doDownload = !skipDownload;

    }

    /**
     * Indicate
     * 
     * @param startNcbiid
     */
    public void setStartingNcbiId(Integer startNcbiid) {
        this.startingNcbiId = startNcbiid;
    }

}