Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.core.loader.association; import org.apache.commons.lang3.StringUtils; import ubic.gemma.core.loader.util.QueuingParser; import ubic.gemma.core.loader.util.parser.BasicLineParser; import ubic.gemma.core.ontology.providers.GeneOntologyService; import ubic.gemma.model.association.GOEvidenceCode; import ubic.gemma.model.association.Gene2GOAssociation; import ubic.gemma.model.common.description.Characteristic; import ubic.gemma.model.common.description.DatabaseType; import ubic.gemma.model.common.description.ExternalDatabase; import ubic.gemma.model.genome.Gene; import ubic.gemma.model.genome.Taxon; import ubic.gemma.persistence.util.Settings; import java.io.IOException; import java.io.InputStream; import java.util.*; import java.util.concurrent.BlockingQueue; /** * This parses GO annotations from NCBI. See <a href="ftp://ftp.ncbi.nih.gov/gene/DATA/README">readme</a>. * * <pre> * tax_id: * the unique identifier provided by NCBI Taxonomy * for the species or strain/isolate * GeneID: * the unique identifier for a gene * --note: for genomes previously available from LocusLink, * the identifiers are equivalent * GO ID: * the GO ID, formatted as GO:0000000 * Evidence: * the evidence code in the gene_association file * Qualifier: * a qualifier for the relationship between the gene * and the GO term * GO term: * the term indicated by the GO ID * PubMed: * pipe-delimited set of PubMed uids reported as evidence * for the association * Category: * the GO category (Function, Process, or Component) * </pre> * * @author keshav * @author pavlidis */ public class NCBIGene2GOAssociationParser extends BasicLineParser<Gene2GOAssociation> implements QueuingParser<Gene2GOAssociation> { private static final String COMMENT_INDICATOR = "#"; private static final Set<String> ignoredEvidenceCodes = new HashSet<>(); static { // these are 'NOT association' codes, or (ND) one that means "nothing known", which we don't use. See // http://www.geneontology.org/GO.evidence.shtml. NCBIGene2GOAssociationParser.ignoredEvidenceCodes.add("IMR"); NCBIGene2GOAssociationParser.ignoredEvidenceCodes.add("IKR"); NCBIGene2GOAssociationParser.ignoredEvidenceCodes.add("IRD"); NCBIGene2GOAssociationParser.ignoredEvidenceCodes.add("ND"); } private final int TAX_ID = Settings.getInt("gene2go.tax_id"); private final int EVIDENCE_CODE = Settings.getInt("gene2go.evidence_code"); private final int GENE_ID = Settings.getInt("gene2go.gene_id"); private final int GO_ID = Settings.getInt("gene2go.go_id"); private BlockingQueue<Gene2GOAssociation> queue; private int count = 0; /** * NCBI Ids of available taxa. */ private Map<Integer, Taxon> taxaNcbiIds; /** * @param taxa to consider (usually we pass in all) */ public NCBIGene2GOAssociationParser(Collection<Taxon> taxa) { ExternalDatabase goDb = ExternalDatabase.Factory.newInstance(); goDb.setName("GO"); goDb.setType(DatabaseType.ONTOLOGY); ExternalDatabase ncbiGeneDb = ExternalDatabase.Factory.newInstance(); ncbiGeneDb.setName("Entrez Gene"); this.taxaNcbiIds = new HashMap<>(); for (Taxon taxon : taxa) { this.taxaNcbiIds.put(taxon.getNcbiId(), taxon); if (taxon.getSecondaryNcbiId() != null) { this.taxaNcbiIds.put(taxon.getSecondaryNcbiId(), taxon); } } } public int getCount() { return count; } @Override public Collection<Gene2GOAssociation> getResults() { return null; } @Override protected void addResult(Gene2GOAssociation obj) { count++; } /** * Note that "-" means a missing value, which in practice only occurs in the "qualifier" and "pubmed" columns. * * @param line line * @return Object */ @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use public Gene2GOAssociation mapFromGene2GO(String line) { String[] values = StringUtils.splitPreserveAllTokens(line, "\t"); if (line.startsWith(NCBIGene2GOAssociationParser.COMMENT_INDICATOR)) return null; if (values.length < 8) return null; Integer taxonId; try { taxonId = Integer.parseInt(values[TAX_ID]); } catch (NumberFormatException e) { throw new RuntimeException(e); } if (!taxaNcbiIds.containsKey(taxonId)) { return null; } Gene gene = Gene.Factory.newInstance(); gene.setNcbiGeneId(Integer.parseInt(values[GENE_ID])); gene.setTaxon(taxaNcbiIds.get(taxonId)); Characteristic oe = Characteristic.Factory.newInstance(); String value = values[GO_ID].replace(":", "_"); oe.setValueUri(GeneOntologyService.BASE_GO_URI + value); oe.setValue(value); // g2GOAss.setSource( ncbiGeneDb ); GOEvidenceCode evcode = null; String evidenceCode = values[EVIDENCE_CODE]; if (!(StringUtils.isBlank(evidenceCode) || evidenceCode.equals("-"))) { if (NCBIGene2GOAssociationParser.ignoredEvidenceCodes.contains(evidenceCode)) { return null; } evcode = GOEvidenceCode.fromString(evidenceCode); } Gene2GOAssociation g2GOAss = Gene2GOAssociation.Factory.newInstance(gene, oe, evcode); try { queue.put(g2GOAss); } catch (InterruptedException e) { throw new RuntimeException(e); } return g2GOAss; } @Override public void parse(InputStream inputStream, BlockingQueue<Gene2GOAssociation> aqueue) throws IOException { if (inputStream == null) throw new IllegalArgumentException("InputStream was null"); this.queue = aqueue; super.parse(inputStream); } @Override public Gene2GOAssociation parseOneLine(String line) { return this.mapFromGene2GO(line); } }