Java tutorial
/* * The Gemma project * * Copyright (c) 2006-2012 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.loader.genome.gene.ncbi; import java.util.Collection; import java.util.HashSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import ubic.gemma.loader.genome.gene.ncbi.model.NCBIGene2Accession; import ubic.gemma.loader.genome.gene.ncbi.model.NCBIGeneInfo; import ubic.gemma.loader.util.converter.Converter; import ubic.gemma.model.common.description.DatabaseEntry; import ubic.gemma.model.common.description.ExternalDatabase; import ubic.gemma.model.genome.Chromosome; import ubic.gemma.model.genome.CytogeneticLocation; import ubic.gemma.model.genome.Gene; import ubic.gemma.model.genome.PhysicalLocation; import ubic.gemma.model.genome.Taxon; import ubic.gemma.model.genome.biosequence.BioSequence; import ubic.gemma.model.genome.biosequence.SequenceType; import ubic.gemma.model.genome.gene.GeneAlias; import ubic.gemma.model.genome.gene.GeneProduct; import ubic.gemma.model.genome.gene.GeneProductType; import ubic.gemma.util.ConfigUtils; import ubic.gemma.util.SequenceBinUtils; /** * Convert NCBIGene2Accession objects into Gemma Gene objects with associated GeneProducts. * * @author pavlidis * @author jrsantos * @version $Id: NcbiGeneConverter.java,v 1.32 2013/02/28 05:33:13 paul Exp $ * @see NCBIGene2Accession, NCBIGeneInfo */ public class NcbiGeneConverter implements Converter<Object, Object> { // configured in project.properties, override in Gemma.properties private static final String RETAIN_PROTEIN_INFO_PARAM = "gemma.store.ncbi.proteininfo"; private static Log log = LogFactory.getLog(NcbiGeneConverter.class.getName()); AtomicBoolean producerDone = new AtomicBoolean(false); AtomicBoolean sourceDone = new AtomicBoolean(false); private static ExternalDatabase genBank; private static ExternalDatabase ensembl; private static boolean retainProteinInformation = ConfigUtils.getBoolean(RETAIN_PROTEIN_INFO_PARAM, false); static { genBank = ExternalDatabase.Factory.newInstance(); genBank.setName("Genbank"); ensembl = ExternalDatabase.Factory.newInstance(); ensembl.setName("Ensembl"); } /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Converter#convert(java.util.Collection) */ @Override public Collection<Object> convert(Collection<? extends Object> sourceDomainObjects) { Collection<Object> results = new HashSet<Object>(); for (Object object : sourceDomainObjects) { results.add(this.convert(object)); } return results; } /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Converter#convert(java.lang.Object) */ public Gene convert(NCBIGeneInfo info) { Gene gene = Gene.Factory.newInstance(); gene.setNcbiGeneId(Integer.parseInt(info.getGeneId())); gene.setName(info.getDefaultSymbol()); gene.setOfficialSymbol(info.getDefaultSymbol()); gene.setOfficialName(info.getDescription()); gene.setEnsemblId(info.getEnsemblId()); /* * NOTE we allow multiple discontinued or previous ids, separated by commas. This is a hack to account for cases * uncovered recently...can be minimized by running this regularly. */ if (info.getHistory() != null) { if (info.getHistory().getCurrentId() != null) { assert info.getGeneId().equals(info.getHistory().getCurrentId()); } assert info.getHistory().getPreviousIds() != null; if (!info.getHistory().getPreviousIds().isEmpty()) { String previousIds = StringUtils.join(info.getHistory().getPreviousIds(), ","); gene.setPreviousNcbiId(previousIds); } } else if (StringUtils.isNotBlank(info.getDiscontinuedId())) { if (log.isDebugEnabled()) log.debug("Gene matches a gene that was discontinued: " + gene + " matches gene that had id " + info.getDiscontinuedId()); gene.setPreviousNcbiId(info.getDiscontinuedId()); } gene.setDescription("Imported from NCBI gene; Nomenclature status: " + info.getNomenclatureStatus()); Taxon t = Taxon.Factory.newInstance(); t.setNcbiId(info.getTaxId()); t.setIsGenesUsable(false); t.setIsSpecies(true); gene.setTaxon(t); PhysicalLocation pl = PhysicalLocation.Factory.newInstance(); Chromosome chrom = Chromosome.Factory.newInstance(); chrom.setTaxon(t); chrom.setName(info.getChromosome()); pl.setChromosome(chrom); CytogeneticLocation cl = CytogeneticLocation.Factory.newInstance(); cl.setChromosome(chrom); cl.setBand(info.getMapLocation()); gene.setPhysicalLocation(pl); gene.setCytogenicLocation(cl); Collection<GeneAlias> aliases = gene.getAliases(); for (String alias : info.getSynonyms()) { GeneAlias newAlias = GeneAlias.Factory.newInstance(); newAlias.setAlias(alias); aliases.add(newAlias); } for (String dbname : info.getDbXrefs().keySet()) { if (!dbname.equalsIgnoreCase("Ensembl")) continue; String identifier = info.getDbXrefs().get(dbname); DatabaseEntry crossref = DatabaseEntry.Factory.newInstance(); crossref.setAccession(identifier); crossref.setExternalDatabase(getEnsembl()); gene.getAccessions().add(crossref); } return gene; } /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Converter#convert(java.lang.Object) */ @Override @SuppressWarnings("unchecked") public Object convert(Object sourceDomainObject) { if (sourceDomainObject instanceof Collection) { return this.convert((Collection<Object>) sourceDomainObject); } assert sourceDomainObject instanceof NCBIGene2Accession; NCBIGene2Accession ncbiGene = (NCBIGene2Accession) sourceDomainObject; return convert(ncbiGene.getInfo()); } /** * @param acc * @param gene * @return */ public Collection<GeneProduct> convert(NCBIGene2Accession acc, Gene gene) { Collection<GeneProduct> geneProducts = new HashSet<GeneProduct>(); // initialize up to two Gene Products // one for RNA, one for Protein (if retainProteinInformation = true) // RNA section if (acc.getRnaNucleotideAccession() != null) { GeneProduct rna = GeneProduct.Factory.newInstance(); // set available fields rna.setNcbiGi(acc.getRnaNucleotideGI()); rna.setGene(gene); rna.setName(acc.getRnaNucleotideAccession()); rna.setType(GeneProductType.RNA); String description = "Imported from NCBI Gene"; if (acc.getStatus() != null) { description = description + " (Refseq status: " + acc.getStatus() + ")."; } if (acc.getRnaNucleotideAccession() != null) { DatabaseEntry accession = DatabaseEntry.Factory.newInstance(); accession.setAccession(acc.getRnaNucleotideAccession()); accession.setAccessionVersion(acc.getRnaNucleotideAccessionVersion()); accession.setExternalDatabase(genBank); if (rna.getAccessions() == null) { rna.setAccessions(new HashSet<DatabaseEntry>()); } rna.getAccessions().add(accession); } /* * Fill in physical location details. */ if (acc.getGenomicNucleotideAccession() != null && gene.getPhysicalLocation() != null) { getChromosomeDetails(acc, gene); PhysicalLocation pl = getPhysicalLocation(acc, gene); rna.setPhysicalLocation(pl); } rna.setDescription(description); geneProducts.add(rna); } // Protein section if (retainProteinInformation && acc.getProteinAccession() != null) { GeneProduct protein = GeneProduct.Factory.newInstance(); // set available fields protein.setNcbiGi(acc.getProteinGI()); protein.setGene(gene); protein.setName(acc.getProteinAccession()); protein.setType(GeneProductType.PROTEIN); protein.setDescription( "Imported from NCBI Gene" + (acc.getStatus() != null ? " (" + acc.getStatus() + ")" : "")); DatabaseEntry accession = DatabaseEntry.Factory.newInstance(); accession.setAccession(acc.getProteinAccession()); accession.setAccessionVersion(acc.getProteinAccessionVersion()); accession.setExternalDatabase(genBank); Collection<DatabaseEntry> accessions = new HashSet<DatabaseEntry>(); accessions.add(accession); protein.setAccessions(accessions); geneProducts.add(protein); } return geneProducts; } /** * @param acc * @param gene * @return */ private PhysicalLocation getPhysicalLocation(NCBIGene2Accession acc, Gene gene) { PhysicalLocation pl = PhysicalLocation.Factory.newInstance(); pl.setChromosome(gene.getPhysicalLocation().getChromosome()); if (acc.getOrientation() != null) { pl.setStrand(acc.getOrientation()); } if (acc.getStartPosition() != null) { pl.setNucleotide(acc.getStartPosition()); pl.setNucleotideLength((int) Math.abs(acc.getEndPosition() - acc.getStartPosition())); pl.setBin(SequenceBinUtils.binFromRange(acc.getStartPosition().intValue(), acc.getEndPosition().intValue())); } return pl; } /** * @param acc * @param gene */ private void getChromosomeDetails(NCBIGene2Accession acc, Gene gene) { Chromosome chrom = gene.getPhysicalLocation().getChromosome(); BioSequence chromSeq = BioSequence.Factory.newInstance(); chromSeq.setName(acc.getGenomicNucleotideAccession()); chromSeq.setType(SequenceType.WHOLE_CHROMOSOME); chromSeq.setTaxon(gene.getTaxon()); DatabaseEntry dbe = DatabaseEntry.Factory.newInstance(); dbe.setExternalDatabase(genBank); dbe.setAccession(acc.getGenomicNucleotideAccession()); dbe.setAccessionVersion(acc.getGenomicNucleotideAccessionVersion()); chromSeq.setSequenceDatabaseEntry(dbe); chrom.setSequence(chromSeq); } public Gene convert(NcbiGeneData data) { // get gene info and fill in gene NCBIGeneInfo geneInfo = data.getGeneInfo(); Gene gene = convert(geneInfo); // grab all accessions and fill in GeneProduct/DatabaseEntry // and associate with Gene Collection<NCBIGene2Accession> gene2accession = data.getAccessions(); Collection<GeneProduct> geneProducts = new HashSet<GeneProduct>(); for (NCBIGene2Accession acc : gene2accession) { geneProducts.addAll(convert(acc, gene)); } gene.setProducts(geneProducts); return gene; } /* * Threaded conversion of domain objects to Gemma objects. */ public void convert(final BlockingQueue<NcbiGeneData> geneInfoQueue, final BlockingQueue<Gene> geneQueue) { // start up thread to convert a member of geneInfoQueue to a gene/geneproduct/databaseentry // then push the gene onto the geneQueue for loading if (!retainProteinInformation) { log.info("Note that protein information will be ignored; set " + RETAIN_PROTEIN_INFO_PARAM + " to true to change"); } Thread convertThread = new Thread(new Runnable() { @Override @SuppressWarnings("synthetic-access") public void run() { while (!(sourceDone.get() && geneInfoQueue.isEmpty())) { try { NcbiGeneData data = geneInfoQueue.poll(); if (data == null) { continue; } Gene converted = convert(data); geneQueue.put(converted); } catch (InterruptedException e) { log.warn("Interrupted"); break; } catch (Exception e) { log.error(e, e); break; } } producerDone.set(true); } }, "Converter"); convertThread.start(); } public boolean isProducerDone() { return this.producerDone.get(); } public void setProducerDoneFlag(AtomicBoolean flag) { this.producerDone = flag; } public void setSourceDoneFlag(AtomicBoolean flag) { this.sourceDone = flag; } /** * @return the genBank */ public static ExternalDatabase getGenbank() { return genBank; } /** * @return the ensembl */ public static ExternalDatabase getEnsembl() { return ensembl; } }