Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.core.loader.genome.gene.ncbi; import org.apache.commons.lang3.StringUtils; import ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo; import ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo.NomenclatureStatus; import ubic.gemma.core.loader.util.QueuingParser; import ubic.gemma.core.loader.util.parser.BasicLineMapParser; import ubic.gemma.core.loader.util.parser.FileFormatException; import java.io.IOException; import java.io.InputStream; import java.util.Collection; import java.util.HashMap; import java.util.Map; import java.util.concurrent.BlockingQueue; /** * Class to parse the gene_info file from NCBI Gene. See <a href="ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README">readme</a> for details * of the format. * * @author pavlidis */ public class NcbiGeneInfoParser extends BasicLineMapParser<String, NCBIGeneInfo> implements QueuingParser<String> { private static final int NCBI_GENEINFO_FIELDS_PER_ROW = 16; private final Map<String, NCBIGeneInfo> results = new HashMap<>(); private BlockingQueue<String> resultsKeys; private boolean filter = true; private Collection<Integer> ncbiTaxonIds; @Override public boolean containsKey(String key) { return results.containsKey(key); } @Override public NCBIGeneInfo get(String key) { return results.get(key); } @Override public Collection<String> getKeySet() { return results.keySet(); } @Override public Collection<NCBIGeneInfo> getResults() { return results.values(); } @Override public NCBIGeneInfo parseOneLine(String line) { String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW) { //noinspection StatementWithEmptyBody // backwards compatibility, old format, hopefully okay if (fields.length == 13 || fields.length == 14 || fields.length == 15) { // They keep adding fields at the end...we only need the first few. } else { throw new FileFormatException("Line + " + line + " is not in the right format: has " + fields.length + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW); } } NCBIGeneInfo geneInfo = new NCBIGeneInfo(); try { // Skip taxa that we don't support. int taxonId = Integer.parseInt(fields[0]); if (filter && ncbiTaxonIds != null) { if (!ncbiTaxonIds.contains(taxonId)) { return null; } } // See ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README // #Format: // tax_id // GeneID // Symbol // LocusTag // Synonyms // dbXrefs, separated by | // chromosome // map_location // description // type_of_gene // Symbol_from_nomenclature_authority // Full_name_from_nomenclature_authority // Nomenclature_status // Other_designations // Modification_date // Feature type geneInfo.setTaxId(taxonId); geneInfo.setGeneId(fields[1]); geneInfo.setDefaultSymbol(fields[2]); geneInfo.setLocusTag(fields[3]); String[] synonyms = StringUtils.splitPreserveAllTokens(fields[4], '|'); for (String synonym : synonyms) { if (synonym.equals("-")) continue; geneInfo.addToSynonyms(synonym); } if (!fields[5].equals("-")) { String[] dbXRefs = StringUtils.splitPreserveAllTokens(fields[5], '|'); for (String dbXr : dbXRefs) { String[] dbF = StringUtils.split(dbXr, ':'); if (dbF.length != 2) { /* * Annoyingly, HGCN identifiers now have the format HGNC:X where X is an integer. This is * apparent from downloading files from HGCN (http://www.genenames.org/cgi-bin/statistics). Same * situation for MGI * * Therefore we have a special case. */ if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) { dbF[1] = dbF[1] + ":" + dbF[2]; } else { // we're very stringent to avoid data corruption. throw new FileFormatException( "Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'"); } } geneInfo.addToDbXRefs(dbF[0], dbF[1]); } } geneInfo.setChromosome(fields[6]); geneInfo.setMapLocation(fields[7]); geneInfo.setDescription(fields[8]); geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9])); geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-")); geneInfo.setNameIsFromAuthority(!fields[11].equals("-")); geneInfo.setNomenclatureStatus(fields[12].equals("-") ? NomenclatureStatus.UNKNOWN : fields[11].equals("O") ? NomenclatureStatus.OFFICIAL : NomenclatureStatus.INTERIM); // ignore 14th field for now - it stores alternate protein names // ignore 15th, modification date } catch (NumberFormatException e) { throw new FileFormatException(e); } return geneInfo; } @Override public String getKey(NCBIGeneInfo newItem) { return newItem.getGeneId(); } @Override protected void put(String key, NCBIGeneInfo value) { try { if (resultsKeys != null) { resultsKeys.put(key); } results.put(key, value); } catch (InterruptedException e) { log.error(e); throw new RuntimeException(e); } } @Override public void parse(InputStream inputStream, BlockingQueue<String> queue) throws IOException { this.resultsKeys = queue; this.parse(inputStream); } public void setFilter(boolean filter) { this.filter = filter; } /** * @param ncbiTaxonIds Taxon IDs (NCBI, not Gemma ids) e.g. 9606 for H. sapiens */ public void setSupportedTaxa(Collection<Integer> ncbiTaxonIds) { this.ncbiTaxonIds = ncbiTaxonIds; } }