ubic.gemma.apps.GenericGenelistDesignGenerator.java Source code

Introduction

Here is the source code for ubic.gemma.apps.GenericGenelistDesignGenerator.java
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2010 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package ubic.gemma.apps;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.lang.StringUtils;
import ubic.gemma.analysis.report.ArrayDesignReportService;
import ubic.gemma.analysis.service.ArrayDesignAnnotationService;
import ubic.gemma.genome.gene.service.GeneService;
import ubic.gemma.genome.taxon.service.TaxonService;
import ubic.gemma.model.common.auditAndSecurity.eventType.AnnotationBasedGeneMappingEvent;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.common.description.ExternalDatabaseService;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.model.expression.arrayDesign.TechnologyType;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.designElement.CompositeSequenceService;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.BioSequenceService;
import ubic.gemma.model.genome.biosequence.PolymerType;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.model.genome.gene.GeneProduct;
import ubic.gemma.model.genome.gene.GeneProductType;
import ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation;
import ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociationService;
import ubic.gemma.util.AbstractCLIContextCLI;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

/**
 * Creates an array design based on the current set of transcripts for a taxon.
 * <p>
 * This is used to create a 'platform' for linking non-array based data to the system, or data for which we have only
 * gene or transcript-level information.
 * <p>
 * See also: To generate annotation files for all genes in a taxon, this can also accomplished by
 * ArrayDesignAnnotationFileCli. The difference here is that an array design is actually created.
 * 
 * @author paul
 * @version $Id: GenericGenelistDesignGenerator.java,v 1.30 2013/02/18 18:36:46 anton Exp $
 */
public class GenericGenelistDesignGenerator extends AbstractCLIContextCLI {
    public static void main(String[] args) {
        GenericGenelistDesignGenerator b = new GenericGenelistDesignGenerator();
        b.doWork(args);
    }

    private AnnotationAssociationService annotationAssociationService;
    private ArrayDesignAnnotationService arrayDesignAnnotationService;
    private ArrayDesignService arrayDesignService;
    private BioSequenceService bioSequenceService;
    private CompositeSequenceService compositeSequenceService;
    private ExternalDatabaseService externalDatabaseService;
    private ArrayDesignReportService arrayDesignReportService;
    private GeneService geneService;

    private Taxon taxon = null;

    private TaxonService taxonService;

    private boolean useNCBIIds = false;
    private boolean useEnsemblIds = false;

    @SuppressWarnings("static-access")
    @Override
    protected void buildOptions() {
        Option taxonOption = OptionBuilder.hasArg().withDescription("taxon name")
                .withDescription("Taxon of the genes").withLongOpt("taxon").isRequired().create('t');
        addOption(taxonOption);

        addOption(OptionBuilder.withDescription("use NCBI numeric IDs as the identifiers instead of gene symbols")
                .create("ncbiids"));

        addOption(
                OptionBuilder.withDescription("use Ensembl identifiers instead of gene symbols").create("ensembl"));

    }

    @Override
    protected Exception doWork(String[] args) {
        super.processCommandLine("Update or create a 'platform' based on the genes for the organism", args);

        ExternalDatabase genbank = externalDatabaseService.find("Genbank");
        ExternalDatabase ensembl = externalDatabaseService.find("Ensembl");
        assert genbank != null;
        assert ensembl != null;

        /*
         * Create the stub array design for the organism. The name and etc. are generated automatically. If the design
         * exists, we update it.
         */

        String shortName = generateShortName();

        ArrayDesign arrayDesign = ArrayDesign.Factory.newInstance();
        arrayDesign.setShortName(shortName);

        // common name
        arrayDesign.setPrimaryTaxon(taxon);
        // fixme this is ugly just use one variable
        String ncbiIDNameExtra = useNCBIIds ? ", indexed by NCBI IDs" : "";
        String ensemblNameExtra = useEnsemblIds ? ", indexed by Ensembl IDs" : "";

        arrayDesign
                .setName("Generic platform for " + taxon.getScientificName() + ncbiIDNameExtra + ensemblNameExtra);
        arrayDesign.setDescription("Created by Gemma");
        arrayDesign.setTechnologyType(TechnologyType.NONE); // this is key

        if (arrayDesignService.find(arrayDesign) != null) {
            log.info("Platform for " + taxon + " already exists, will update");
            arrayDesign = arrayDesignService.find(arrayDesign);
            arrayDesignService.deleteGeneProductAssociations(arrayDesign);
            arrayDesign = arrayDesignService.load(arrayDesign.getId());

        } else {
            log.info("Creating new 'generic' platform");
            arrayDesign = arrayDesignService.create(arrayDesign);
        }
        arrayDesign = arrayDesignService.thaw(arrayDesign);

        // temporary: making sure we set it, as it is new.
        arrayDesign.setTechnologyType(TechnologyType.NONE);

        /*
         * Load up the genes for the organism.
         */
        Collection<Gene> knownGenes = geneService.loadAll(taxon);
        log.info("Taxon has " + knownGenes.size() + " genes");

        // Map<Gene, CompositeSequence> existingGeneMap = getExistingGeneMap( arrayDesign );
        Map<String, CompositeSequence> existingSymbolmap = getExistingProbeNameMap(arrayDesign);

        int count = 0;
        int numWithNoTranscript = 0;
        // int hasGeneAlready = 0;
        // int numNewGenes = 0;
        int numNewElements = 0;
        int numUpdatedElements = 0;
        for (Gene gene : knownGenes) {
            gene = geneService.thaw(gene);

            Collection<GeneProduct> products = gene.getProducts();

            if (products.isEmpty()) {
                numWithNoTranscript++;
                log.debug("No transcript for " + gene);
                continue;
            }

            count++;

            CompositeSequence csForGene = null;

            if (useNCBIIds) {
                if (gene.getNcbiGeneId() == null) {
                    log.debug("No NCBI ID for " + gene + ", skipping");
                    continue;
                }
                if (existingSymbolmap.containsKey(gene.getNcbiGeneId())) {
                    csForGene = existingSymbolmap.get(gene.getNcbiGeneId());
                }
            } else if (useEnsemblIds) {
                if (gene.getEnsemblId() == null) {
                    log.debug("No Ensembl ID for " + gene + ", skipping");
                    continue;
                }
                if (existingSymbolmap.containsKey(gene.getEnsemblId())) {
                    csForGene = existingSymbolmap.get(gene.getEnsemblId());
                }
            } else {
                if (existingSymbolmap.containsKey(gene.getOfficialSymbol())) {
                    csForGene = existingSymbolmap.get(gene.getOfficialSymbol());
                }
            }

            assert csForGene == null || csForGene.getId() != null : "Null id for " + csForGene;

            /*
             * We arbitrarily link the "probe" to one of the gene's RNA transcripts. We could consider other strategies
             * to pick the representative, but it generally doesn't matter.
             */
            for (GeneProduct geneProduct : products) {
                if (!GeneProductType.RNA.equals(geneProduct.getType())) {
                    continue;
                }

                /*
                 * Name is usually the genbank or ensembl accession
                 */
                String name = geneProduct.getName();
                BioSequence bioSequence = BioSequence.Factory.newInstance();
                Collection<DatabaseEntry> accessions = geneProduct.getAccessions();
                bioSequence.setName(name);
                bioSequence.setTaxon(taxon);

                bioSequence.setPolymerType(PolymerType.RNA);

                // FIXME miRNAs (though, we don't really use this field.)
                bioSequence.setType(SequenceType.mRNA);
                BioSequence existing = null;

                if (accessions.isEmpty()) {
                    // this should not be hit.
                    log.warn("No accession for " + name);
                    DatabaseEntry de = DatabaseEntry.Factory.newInstance();
                    de.setAccession(name);
                    if (name.startsWith("ENS") && name.length() > 10) {
                        de.setExternalDatabase(ensembl);
                    } else {
                        if (name.matches("^[A-Z]{1,2}(_?)[0-9]+(\\.[0-9]+)?$")) {
                            de.setExternalDatabase(genbank);
                        } else {
                            log.info("Name doesn't look like genbank or ensembl, skipping: " + name);
                            continue;
                        }
                    }
                    bioSequence.setSequenceDatabaseEntry(de);
                } else {
                    bioSequence.setSequenceDatabaseEntry(accessions.iterator().next());
                    existing = bioSequenceService.findByAccession(accessions.iterator().next());

                    // FIXME It is possible that this sequence will have been aligned to the genome, which is a bit
                    // confusing. So it will map to a gene. Worse case: it maps to more than one gene ...

                }

                if (existing == null) {
                    bioSequence = (BioSequence) getPersisterHelper().persist(bioSequence);
                } else {
                    bioSequence = existing;
                }

                assert bioSequence != null && bioSequence.getId() != null;

                if (bioSequence.getSequenceDatabaseEntry() == null) {
                    log.info("No DB entry for " + bioSequence + "(" + gene
                            + "), will look for a better sequence to use ...");
                    continue;
                }

                if (csForGene == null) {
                    if (log.isDebugEnabled())
                        log.debug("New element " + " with " + bioSequence + " for " + gene);
                    csForGene = CompositeSequence.Factory.newInstance();
                    if (useNCBIIds) {
                        if (gene.getNcbiGeneId() == null) {
                            continue;
                        }
                        csForGene.setName(gene.getNcbiGeneId().toString());
                    } else if (useEnsemblIds) {
                        if (gene.getEnsemblId() == null) {
                            continue;
                        }
                        csForGene.setName(gene.getEnsemblId());
                    } else {
                        csForGene.setName(gene.getOfficialSymbol());
                    }

                    csForGene.setArrayDesign(arrayDesign);
                    csForGene.setBiologicalCharacteristic(bioSequence);
                    csForGene.setDescription("Generic expression element for " + gene);
                    csForGene = compositeSequenceService.create(csForGene);
                    assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
                    arrayDesign.getCompositeSequences().add(csForGene);
                    numNewElements++;
                } else {
                    if (log.isDebugEnabled())
                        log.debug("Updating existing element: " + csForGene + " with " + bioSequence + " for "
                                + gene);
                    csForGene.setArrayDesign(arrayDesign);
                    csForGene.setBiologicalCharacteristic(bioSequence);
                    csForGene.setDescription("Generic expression element for " + gene);
                    assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene;
                    compositeSequenceService.update(csForGene);

                    // making sure ...
                    csForGene = compositeSequenceService.load(csForGene.getId());
                    assert csForGene.getId() != null;
                    arrayDesign.getCompositeSequences().add(csForGene);

                    numUpdatedElements++;
                }

                assert bioSequence.getId() != null;
                assert geneProduct.getId() != null;
                assert csForGene.getBiologicalCharacteristic() != null
                        && csForGene.getBiologicalCharacteristic().getId() != null;

                AnnotationAssociation aa = AnnotationAssociation.Factory.newInstance();
                aa.setGeneProduct(geneProduct);
                aa.setBioSequence(bioSequence);
                annotationAssociationService.create(aa);

                break;
            }

            if (count % 100 == 0)
                log.info(count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements
                        + " updated elements; " + numWithNoTranscript
                        + " genes had no transcript and were skipped.");
        }

        arrayDesignService.update(arrayDesign);

        log.info("Array design has " + arrayDesignService.numCompositeSequenceWithGenes(arrayDesign)
                + " 'probes' associated with genes.");

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());
        auditTrailService.addUpdateEvent(arrayDesign, AnnotationBasedGeneMappingEvent.Factory.newInstance(),
                count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements
                        + " updated elements; " + numWithNoTranscript
                        + " genes had no transcript and were skipped.");
        try {
            arrayDesignAnnotationService.deleteExistingFiles(arrayDesign);
        } catch (IOException e) {
            log.error("Problem deleting old annotation files: " + e.getMessage());
        }

        log.info("Don't forget to update the annotation files");

        return null;

    }

    @Override
    protected void processOptions() {
        super.processOptions();

        geneService = this.getBean(GeneService.class);
        arrayDesignAnnotationService = this.getBean(ArrayDesignAnnotationService.class);
        taxonService = getBean(TaxonService.class);
        bioSequenceService = getBean(BioSequenceService.class);
        arrayDesignService = getBean(ArrayDesignService.class);
        compositeSequenceService = getBean(CompositeSequenceService.class);
        annotationAssociationService = getBean(AnnotationAssociationService.class);
        externalDatabaseService = getBean(ExternalDatabaseService.class);
        arrayDesignReportService = getBean(ArrayDesignReportService.class);

        if (hasOption('t')) {
            String taxonName = getOptionValue('t');
            this.taxon = taxonService.findByCommonName(taxonName);
            if (taxon == null) {
                log.error("ERROR: Cannot find taxon " + taxonName);
            }
        }
        if (hasOption("ncbiids")) {
            this.useNCBIIds = true;
        } else if (hasOption("ensembl")) {
            this.useEnsemblIds = true;
        }

        if (useNCBIIds && useEnsemblIds) {
            throw new IllegalArgumentException("Choose one of ensembl or ncbi ids or gene symbols");
        }
    }

    /**
     * @return
     */
    private String generateShortName() {
        String ncbiIdSuffix = useNCBIIds ? "_ncbiIds" : "";
        String ensemblIdSuffix = useEnsemblIds ? "_ensemblIds" : "";
        String shortName = "";
        if (StringUtils.isBlank(taxon.getCommonName())) {
            shortName = "Generic_" + StringUtils.strip(taxon.getScientificName()).replaceAll(" ", "_")
                    + ncbiIdSuffix;
        } else {
            shortName = "Generic_" + StringUtils.strip(taxon.getCommonName()).replaceAll(" ", "_") + ncbiIdSuffix
                    + ensemblIdSuffix;
        }
        return shortName;
    }

    /**
     * @param arrayDesign
     * @return
     */
    private Map<String, CompositeSequence> getExistingProbeNameMap(ArrayDesign arrayDesign) {

        Map<String, CompositeSequence> existingElements = new HashMap<String, CompositeSequence>();

        if (arrayDesign.getCompositeSequences().isEmpty())
            return existingElements;

        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            assert cs.getId() != null : "Null id for " + cs;
            existingElements.put(cs.getName(), cs);
        }
        return existingElements;
    }

}