Java tutorial
/* * The Gemma project * * Copyright (c) 2010 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. */ package ubic.gemma.core.apps; import java.util.Collection; import java.util.HashMap; import java.util.Map; import org.apache.commons.lang3.StringUtils; import ubic.gemma.core.analysis.report.ArrayDesignReportService; import ubic.gemma.core.analysis.service.ArrayDesignAnnotationService; import ubic.gemma.core.apps.GemmaCLI.CommandGroup; import ubic.gemma.core.genome.gene.service.GeneService; import ubic.gemma.core.util.AbstractCLI; import ubic.gemma.core.util.AbstractCLIContextCLI; import ubic.gemma.model.common.auditAndSecurity.eventType.AnnotationBasedGeneMappingEvent; import ubic.gemma.model.common.description.DatabaseEntry; import ubic.gemma.model.common.description.ExternalDatabase; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.arrayDesign.TechnologyType; import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.genome.Gene; import ubic.gemma.model.genome.Taxon; import ubic.gemma.model.genome.biosequence.BioSequence; import ubic.gemma.model.genome.biosequence.PolymerType; import ubic.gemma.model.genome.biosequence.SequenceType; import ubic.gemma.model.genome.gene.GeneProduct; import ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation; import ubic.gemma.persistence.service.common.description.ExternalDatabaseService; import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService; import ubic.gemma.persistence.service.expression.designElement.CompositeSequenceService; import ubic.gemma.persistence.service.genome.biosequence.BioSequenceService; import ubic.gemma.persistence.service.genome.sequenceAnalysis.AnnotationAssociationService; import ubic.gemma.persistence.service.genome.taxon.TaxonService; /** * Creates an array design based on the current set of transcripts for a taxon. * This is used to create a 'platform' for linking non-array based data to the system, or data for which we have only * gene or transcript-level information. * See also: To generate annotation files for all genes in a taxon, this can also accomplished by * ArrayDesignAnnotationFileCli. The difference here is that an array design is actually created. * * @author paul */ public class GenericGenelistDesignGenerator extends AbstractCLIContextCLI { private AnnotationAssociationService annotationAssociationService; private ArrayDesignAnnotationService arrayDesignAnnotationService; private ArrayDesignReportService arrayDesignReportService; private ArrayDesignService arrayDesignService; private BioSequenceService bioSequenceService; private CompositeSequenceService compositeSequenceService; private ExternalDatabaseService externalDatabaseService; private GeneService geneService; private Taxon taxon = null; private boolean useEnsemblIds = false; private boolean useNCBIIds = false; public static void main(String[] args) { GenericGenelistDesignGenerator b = new GenericGenelistDesignGenerator(); executeCommand(b, args); } @Override public CommandGroup getCommandGroup() { return CommandGroup.PLATFORM; } @Override public String getCommandName() { return "genericPlatform"; } @SuppressWarnings("static-access") @Override protected void buildOptions() { super.addOption("t", "taxon", "Taxon of the genes", "taxon"); super.addOption("ncbiids", "use NCBI numeric IDs as the identifiers instead of gene symbols"); super.addOption("ensembl", "use Ensembl identifiers instead of gene symbols"); } @Override protected Exception doWork(String[] args) { Exception exception = super.processCommandLine(args); if (exception != null) { return exception; } ExternalDatabase genbank = externalDatabaseService.findByName("Genbank"); ExternalDatabase ensembl = externalDatabaseService.findByName("Ensembl"); assert genbank != null; assert ensembl != null; /* * Create the stub array design for the organism. The name and etc. are generated automatically. If the design * exists, we update it. */ String shortName = this.generateShortName(); ArrayDesign arrayDesign = ArrayDesign.Factory.newInstance(); arrayDesign.setShortName(shortName); // common name arrayDesign.setPrimaryTaxon(taxon); String nameExt = useNCBIIds ? ", indexed by NCBI IDs" : useEnsemblIds ? ", indexed by Ensembl IDs" : ""; arrayDesign.setName("Generic platform for " + taxon.getScientificName() + nameExt); arrayDesign.setDescription("Created by Gemma"); arrayDesign.setTechnologyType(TechnologyType.GENELIST); // this is key if (arrayDesignService.find(arrayDesign) != null) { AbstractCLI.log.info("Platform for " + taxon + " already exists, will update"); arrayDesign = arrayDesignService.find(arrayDesign); arrayDesignService.deleteGeneProductAssociations(arrayDesign); arrayDesign = arrayDesignService.load(arrayDesign.getId()); } else { AbstractCLI.log.info("Creating new 'generic' platform"); arrayDesign = arrayDesignService.create(arrayDesign); } arrayDesign = arrayDesignService.thaw(arrayDesign); // temporary: making sure we set it, as it is new. arrayDesign.setTechnologyType(TechnologyType.GENELIST); /* * Load up the genes for the organism. */ Collection<Gene> knownGenes = geneService.loadAll(taxon); AbstractCLI.log.info("Taxon has " + knownGenes.size() + " genes"); // this would be good for cases where the identifier we are using has changed. Map<Gene, CompositeSequence> existingGeneMap = new HashMap<>(); if (!useNCBIIds && !useEnsemblIds) { // only using this for symbol changes. existingGeneMap = this.getExistingGeneMap(arrayDesign); } Map<String, CompositeSequence> existingSymbolMap = this.getExistingProbeNameMap(arrayDesign); int count = 0; int numWithNoTranscript = 0; // int hasGeneAlready = 0; // int numNewGenes = 0; int numNewElements = 0; int numUpdatedElements = 0; for (Gene gene : knownGenes) { gene = geneService.thaw(gene); Collection<GeneProduct> products = gene.getProducts(); if (products.isEmpty()) { numWithNoTranscript++; AbstractCLI.log.debug("No transcript for " + gene); continue; } count++; CompositeSequence csForGene = null; if (useNCBIIds) { if (gene.getNcbiGeneId() == null) { AbstractCLI.log.debug("No NCBI ID for " + gene + ", skipping"); continue; } if (existingSymbolMap.containsKey(gene.getNcbiGeneId().toString())) { csForGene = existingSymbolMap.get(gene.getNcbiGeneId().toString()); } } else if (useEnsemblIds) { if (gene.getEnsemblId() == null) { AbstractCLI.log.debug("No Ensembl ID for " + gene + ", skipping"); continue; } if (existingSymbolMap.containsKey(gene.getEnsemblId())) { csForGene = existingSymbolMap.get(gene.getEnsemblId()); } } else { /* * detect when the symbol has changed */ if (existingSymbolMap.containsKey(gene.getOfficialSymbol())) { csForGene = existingSymbolMap.get(gene.getOfficialSymbol()); } else if (existingGeneMap.containsKey(gene)) { csForGene = existingGeneMap.get(gene); AbstractCLI.log.debug("Gene symbol has changed for: " + gene + "? Current element has name=" + csForGene.getName()); csForGene.setName(gene.getOfficialSymbol()); } } assert csForGene == null || csForGene.getId() != null : "Null id for " + csForGene; /* * We arbitrarily link the "probe" to one of the gene's RNA transcripts. We could consider other strategies * to pick the representative, but it generally doesn't matter. */ for (GeneProduct geneProduct : products) { /* * Name is usually the genbank or ensembl accession */ String name = geneProduct.getName(); BioSequence bioSequence = BioSequence.Factory.newInstance(); Collection<DatabaseEntry> accessions = geneProduct.getAccessions(); bioSequence.setName(name); bioSequence.setTaxon(taxon); bioSequence.setPolymerType(PolymerType.RNA); bioSequence.setType(SequenceType.mRNA); BioSequence existing = null; if (accessions.isEmpty()) { // this should not be hit. AbstractCLI.log.warn("No accession for " + name); DatabaseEntry de = DatabaseEntry.Factory.newInstance(); de.setAccession(name); if (name.startsWith("ENS") && name.length() > 10) { de.setExternalDatabase(ensembl); } else { if (name.matches("^[A-Z]{1,2}(_?)[0-9]+(\\.[0-9]+)?$")) { de.setExternalDatabase(genbank); } else { AbstractCLI.log.info("Name doesn't look like genbank or ensembl, skipping: " + name); continue; } } bioSequence.setSequenceDatabaseEntry(de); } else { bioSequence.setSequenceDatabaseEntry(accessions.iterator().next()); existing = bioSequenceService.findByAccession(accessions.iterator().next()); // FIXME It is possible that this sequence will have been aligned to the genome, which is a bit // confusing. So it will map to a gene. Worse case: it maps to more than one gene ... } if (existing == null) { bioSequence = (BioSequence) this.getPersisterHelper().persist(bioSequence); } else { bioSequence = existing; } assert bioSequence != null && bioSequence.getId() != null; if (bioSequence.getSequenceDatabaseEntry() == null) { AbstractCLI.log.info("No DB entry for " + bioSequence + "(" + gene + "), will look for a better sequence to use ..."); continue; } if (csForGene == null) { if (AbstractCLI.log.isDebugEnabled()) AbstractCLI.log.debug("New element " + " with " + bioSequence + " for " + gene); csForGene = CompositeSequence.Factory.newInstance(); if (useNCBIIds) { csForGene.setName(gene.getNcbiGeneId().toString()); } else if (useEnsemblIds) { csForGene.setName(gene.getEnsemblId()); } else { csForGene.setName(gene.getOfficialSymbol()); } csForGene.setArrayDesign(arrayDesign); csForGene.setBiologicalCharacteristic(bioSequence); csForGene.setDescription("Generic expression element for " + gene); csForGene = compositeSequenceService.create(csForGene); assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene; arrayDesign.getCompositeSequences().add(csForGene); numNewElements++; } else { boolean changed = false; assert csForGene.getId() != null : "No id for " + csForGene + " for " + gene; if (!csForGene.getArrayDesign().equals(arrayDesign)) { // does this happen? log.debug("Platform changed? " + csForGene + " on " + csForGene.getArrayDesign()); csForGene.setArrayDesign(arrayDesign); csForGene.setDescription("Generic expression element for " + gene); changed = true; } if (!csForGene.getBiologicalCharacteristic().equals(bioSequence)) { // does this happen? csForGene.setBiologicalCharacteristic(bioSequence); changed = true; } if (changed) { compositeSequenceService.update(csForGene); } // making sure ... csForGene = compositeSequenceService.load(csForGene.getId()); assert csForGene.getId() != null; arrayDesign.getCompositeSequences().add(csForGene); if (changed) { if (AbstractCLI.log.isDebugEnabled()) AbstractCLI.log.debug("Updating existing element: " + csForGene + " with " + bioSequence + " for " + gene); numUpdatedElements++; } } assert bioSequence.getId() != null; assert geneProduct.getId() != null; assert csForGene.getBiologicalCharacteristic() != null && csForGene.getBiologicalCharacteristic().getId() != null; AnnotationAssociation aa = AnnotationAssociation.Factory.newInstance(); aa.setGeneProduct(geneProduct); aa.setBioSequence(bioSequence); annotationAssociationService.create(aa); break; } if (count % 100 == 0) AbstractCLI.log.info(count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped."); } AbstractCLI.log.info("Array design has " + arrayDesignService.numCompositeSequenceWithGenes(arrayDesign) + " 'probes' associated with genes."); arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId()); auditTrailService.addUpdateEvent(arrayDesign, AnnotationBasedGeneMappingEvent.Factory.newInstance(), count + " genes processed; " + numNewElements + " new elements; " + numUpdatedElements + " updated elements; " + numWithNoTranscript + " genes had no transcript and were skipped."); arrayDesignAnnotationService.deleteExistingFiles(arrayDesign); AbstractCLI.log.info("Don't forget to update the annotation files"); return null; } @Override public String getShortDesc() { return "Update or create a 'platform' based on the genes for the organism"; } @Override protected void processOptions() { super.processOptions(); geneService = this.getBean(GeneService.class); arrayDesignAnnotationService = this.getBean(ArrayDesignAnnotationService.class); TaxonService taxonService = this.getBean(TaxonService.class); bioSequenceService = this.getBean(BioSequenceService.class); arrayDesignService = this.getBean(ArrayDesignService.class); compositeSequenceService = this.getBean(CompositeSequenceService.class); annotationAssociationService = this.getBean(AnnotationAssociationService.class); externalDatabaseService = this.getBean(ExternalDatabaseService.class); arrayDesignReportService = this.getBean(ArrayDesignReportService.class); if (this.hasOption('t')) { this.taxon = this.setTaxonByName(taxonService); } if (this.hasOption("ncbiids")) { this.useNCBIIds = true; } else if (this.hasOption("ensembl")) { this.useEnsemblIds = true; } if (useNCBIIds && useEnsemblIds) { throw new IllegalArgumentException("Choose one of ensembl or ncbi ids or gene symbols"); } } private String generateShortName() { String ncbiIdSuffix = useNCBIIds ? "_ncbiIds" : ""; String ensemblIdSuffix = useEnsemblIds ? "_ensemblIds" : ""; String shortName; if (StringUtils.isBlank(taxon.getCommonName())) { shortName = "Generic_" + StringUtils.strip(taxon.getScientificName()).replaceAll(" ", "_") + ncbiIdSuffix; } else { shortName = "Generic_" + StringUtils.strip(taxon.getCommonName()).replaceAll(" ", "_") + ncbiIdSuffix + ensemblIdSuffix; } return shortName; } /** * For gene symbols. */ private Map<Gene, CompositeSequence> getExistingGeneMap(ArrayDesign arrayDesign) { Map<Gene, CompositeSequence> existingElements = new HashMap<>(); if (arrayDesign.getCompositeSequences().isEmpty()) return existingElements; AbstractCLI.log.info("Loading genes for existing platform ..."); Map<CompositeSequence, Collection<Gene>> geneMap = compositeSequenceService .getGenes(arrayDesign.getCompositeSequences()); AbstractCLI.log.info("Platform has genes already for " + geneMap.size() + "/" + arrayDesign.getCompositeSequences().size() + " elements."); for (CompositeSequence cs : geneMap.keySet()) { Collection<Gene> genes = geneMap.get(cs); /* * Two genes with the same symbol, but might be a mistake from an earlier run. */ Gene g = null; if (genes.size() > 1) { AbstractCLI.log.warn("More than one gene for: " + cs + ": " + StringUtils.join(genes, ";")); for (Gene cg : genes) { if (cg.getOfficialSymbol().equals(cs.getName())) { g = cg; } } } else { g = genes.iterator().next(); } existingElements.put(g, cs); } return existingElements; } private Map<String, CompositeSequence> getExistingProbeNameMap(ArrayDesign arrayDesign) { Map<String, CompositeSequence> existingElements = new HashMap<>(); if (arrayDesign.getCompositeSequences().isEmpty()) return existingElements; for (CompositeSequence cs : arrayDesign.getCompositeSequences()) { assert cs.getId() != null : "Null id for " + cs; existingElements.put(cs.getName(), cs); } return existingElements; } }