Java tutorial
package org.intermine.bio.dataconversion; /* * Copyright (C) 2002-2013 FlyMine * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. See the LICENSE file for more * information or http://www.gnu.org/copyleft/lesser.html. * */ import java.io.Reader; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.io.File; import java.io.FileReader; import java.io.BufferedReader; import java.util.Iterator; import java.util.List; import java.util.ArrayList; import org.intermine.dataconversion.ItemWriter; import org.intermine.metadata.Model; import org.intermine.xml.full.Item; import org.intermine.dataconversion.ItemWriter; import org.intermine.objectstore.ObjectStoreException; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.intermine.util.FormattedTextParser; import org.intermine.xml.full.ReferenceList; /** * * @author */ public class XenmineConverter extends BioDirectoryConverter { // private static final String DATASET_TITLE = "XenBase ftp site files"; private static final String DATA_SOURCE_NAME = "XenBase"; private static final Logger LOG = Logger.getLogger(XenmineConverter.class); private Map<String, String> chromosomes = new HashMap(); private Map<String, Item> genes = new HashMap(); private Map<String, Item> organisms = new HashMap(); private Map<String, HashSet> genesPageName = new HashMap(); private Map<String, String> genesAliases = new HashMap(); private Map<String, String> synonyms = new HashMap(); private Map<String, String> datasources = new HashMap(); private Map<String, Item> publications = new HashMap(); private Map<String, String> pmid_xbartid = new HashMap(); private Map<String, Item> terms = new HashMap(); //anatomy terms private Map<String, Item> imgs = new HashMap<String, Item>(); private static final String URL = 
"http://www.fruitfly.org/insituimages/insitu_images/thumbnails/"; private static final String TAXON_ID = "8364"; private static final String GENOME_BUILD = "JGI 8.0"; private static final String LAEVIS_BUILD = "JGI 7.1"; //private Item organism; private Map<String, String> dataSets = new LinkedHashMap<String, String>(); private static final String SCAFFOLD_MAPPING_FILE = "GenePageToJgiTropicalisScaffoldMapping_8.0.txt"; private static final String LEAVIS_SCAFFOLD_MAPPING_FILE = "GenePageToJgiLaevisScaffoldMapping_7.1.txt"; private static final String NAMES_FILE = "JgiToXenbaseGenePage_8.0.txt"; private static final String SYNONYMS_FILE = "GenePageGeneralInfo_ManuallyCurated.txt"; private static final String GENEPAGE_FILE = "XenbaseGenepageToGeneIdMapping.txt"; private static final String INTERACTIONS_FILE = "GenePageInteractants.txt"; private static final String LITERATURE_FILE = "LiteratureMatchedGenesByPaper.txt"; private static final String GO_FILE = "GenePageGoTerms.txt"; private static final String ENSEMBL_FILE = "GenePageEnsemblModelMapping.txt"; private static final String MOUSE_HOMOLOG_FILE = "XenbaseGeneMouseOrthologMapping.txt"; private static final String HUMAN_HOMOLOG_FILE = "XenbaseGeneHumanOrthologMapping.txt"; private static final String ZEBRAFISH_HOMOLOG_FILE = "XenbaseGeneZebrafishOrthologMapping.txt"; private static final String NON_ENTREZ_HOMOLOG_FILE = "XenbaseGeneNonEntrezOrthologMapping.txt"; private static final String EXPRESSION_FILE = "GeneExpression_ALL.txt"; private static final String NCBI_PROTEIN_FILE = "NcbiProteinXenbaseGene_ALL.txt"; private static final String NCBI_MRNA_FILE = "NcbiMrnaXenbaseGene_ALL.txt"; private static final String ENTREZ_UNIGENE_FILE = "GenePage_ALL_EntrezGeneUnigeneMapping.txt"; private static final String ANATOMY_MAPPING_FILE = "GenePageAnatomyOntologyMapping.txt"; protected String termClassName = "GOTerm"; protected String termCollectionName = "goAnnotation"; protected String annotationClassName = 
"GOAnnotation"; private Map<String, String> featureMap = new HashMap(); /** * Constructor * @param writer the ItemWriter used to handle the resultant items * @param model the Model */ public XenmineConverter(ItemWriter writer, Model model) throws ObjectStoreException { super(writer, model, DATA_SOURCE_NAME, DATASET_TITLE); } /** * * * {@inheritDoc} */ public void process(File dataDir) throws Exception { Map<String, File> files = readFilesInDir(dataDir); String[] requiredFiles = new String[] { SCAFFOLD_MAPPING_FILE, LEAVIS_SCAFFOLD_MAPPING_FILE, NAMES_FILE, SYNONYMS_FILE, GENEPAGE_FILE, INTERACTIONS_FILE, LITERATURE_FILE, GO_FILE, ENSEMBL_FILE, MOUSE_HOMOLOG_FILE, HUMAN_HOMOLOG_FILE, ZEBRAFISH_HOMOLOG_FILE, NON_ENTREZ_HOMOLOG_FILE, EXPRESSION_FILE, NCBI_PROTEIN_FILE, NCBI_MRNA_FILE, ENTREZ_UNIGENE_FILE, ANATOMY_MAPPING_FILE }; Set<String> missingFiles = new HashSet<String>(); for (String requiredFile : requiredFiles) { if (!files.containsKey(requiredFile)) { missingFiles.add(requiredFile); } } if (!missingFiles.isEmpty()) { throw new RuntimeException("Not all required files for the xenmine sources were found in: " + dataDir.getAbsolutePath() + ", was missing " + missingFiles); } //IDs and symbols processScaffoldMappingFile(new FileReader(files.get(SCAFFOLD_MAPPING_FILE)), GENOME_BUILD); processLaevisScaffoldMappingFile(new FileReader(files.get(LEAVIS_SCAFFOLD_MAPPING_FILE)), LAEVIS_BUILD); processNameFile(new FileReader(files.get(NAMES_FILE)), GENOME_BUILD); processGenePageFile(new FileReader(files.get(GENEPAGE_FILE))); processSynFile(new FileReader(files.get(SYNONYMS_FILE))); //annotation processLiteratureFile(new FileReader(files.get(LITERATURE_FILE))); processInteractionsFile(new FileReader(files.get(INTERACTIONS_FILE))); processGoFile(new FileReader(files.get(GO_FILE))); //cross-refs processEnsemblFile(new FileReader(files.get(ENSEMBL_FILE))); //homolog files processMouseOrthologFile(new FileReader(files.get(MOUSE_HOMOLOG_FILE))); processHumanOrthologFile(new 
FileReader(files.get(HUMAN_HOMOLOG_FILE))); processZebrafishOrthologFile(new FileReader(files.get(ZEBRAFISH_HOMOLOG_FILE))); processNonEntrezOrthologFile(new FileReader(files.get(NON_ENTREZ_HOMOLOG_FILE))); //expression files processExpressionFile(new FileReader(files.get(EXPRESSION_FILE))); //cross-refs processNcbiProteinFile(new FileReader(files.get(NCBI_PROTEIN_FILE))); processNcbiMrnaFile(new FileReader(files.get(NCBI_MRNA_FILE))); processUnigeneEntrezFile(new FileReader(files.get(ENTREZ_UNIGENE_FILE))); processAnatomyMappingFile(new FileReader(files.get(ANATOMY_MAPPING_FILE))); storeGenes(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processScaffoldMappingFile(Reader preader, String genomeBuild) throws Exception, ObjectStoreException { /* Xenbase gene ID * gene symbol * JGI Model Name * JGI Scaffold Name * JGI Scaffold Start Position * JGI Scaffold End Position * JGI Scaffold Strand * GenePageToJgiTropicalisScaffoldMapping_4.1.txt:XB-GENE-1021745 c1orf228 e_gw1.1.454.1 scaffold_1 4235 14986 -1 */ System.out.println("Processing scaffold file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } Item organism = createItem("Organism"); organism.setAttribute("taxonId", TAXON_ID); organism.setAttribute("genus", "Xenopus"); organism.setAttribute("species", "tropicalis"); organism.setAttribute("name", "Xenopus tropicalis"); organism.setAttribute("shortName", "X. tropicalis"); store(organism); while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 7) { LOG.error("Couldn't process line. 
Expected 7 cols, but was " + line.length); continue; } String xbGeneId = line[0].trim(); String symbol = line[1].trim(); String jgimodelName = line[2].trim(); String chromosome = line[3].trim(); String start = line[4].trim(); String end = line[5].trim(); String strand = line[6].trim(); String length = getLength(start, end); Item gene = createItem("Gene"); gene.setAttribute("primaryIdentifier", jgimodelName); gene.setAttribute("secondaryIdentifier", xbGeneId); gene.setAttribute("symbol", symbol); gene.setAttribute("genomeBuild", genomeBuild); String chrRefId = getChromosome(chromosome, organism.getIdentifier()); gene.setReference("chromosome", chrRefId); String locationRefId = getLocation(gene, chrRefId, start, end, strand); gene.setReference("chromosomeLocation", locationRefId); gene.setAttribute("jgiModelName", jgimodelName); gene.setReference("organism", organism); genes.put(xbGeneId, gene); } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processLaevisScaffoldMappingFile(Reader preader, String genomeBuild) throws Exception, ObjectStoreException { /* Xenbase gene ID * gene symbol * JGI Model Name * JGI Scaffold Name * JGI Scaffold Start Position * JGI Scaffold End Position * JGI Scaffold Strand * XB-GENE-478732 impdh2 XeXenL6RMv10052779m.g Scaffold87688 113849 139524 -1 */ System.out.println("Processing laevis scaffold file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } Item organism = createItem("Organism"); organism.setAttribute("taxonId", "8355"); organism.setAttribute("genus", "Xenopus"); organism.setAttribute("species", "laevis"); organism.setAttribute("name", "Xenopus laevis"); organism.setAttribute("shortName", "X. 
laevis"); store(organism); while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 7) { LOG.error("Couldn't process line. Expected 7 cols, but was " + line.length); continue; } String xbGeneId = line[0].trim(); String symbol = line[1].trim(); String jgimodelName = line[2].trim(); String chromosome = line[3].trim(); String start = line[4].trim(); String end = line[5].trim(); String strand = line[6].trim(); String length = getLength(start, end); Item gene = createItem("Gene"); gene.setAttribute("primaryIdentifier", jgimodelName); gene.setAttribute("secondaryIdentifier", xbGeneId); if (!symbol.equalsIgnoreCase("unnamed")) gene.setAttribute("symbol", symbol); gene.setAttribute("genomeBuild", genomeBuild); String chrRefId = getChromosome(chromosome, organism.getIdentifier()); gene.setReference("chromosome", chrRefId); String locationRefId = getLocation(gene, chrRefId, start, end, strand); gene.setReference("chromosomeLocation", locationRefId); gene.setAttribute("jgiModelName", jgimodelName); gene.setReference("organism", organism); genes.put(xbGeneId, gene); } preader.close(); } private Map<String, File> readFilesInDir(File dir) { Map<String, File> files = new HashMap<String, File>(); for (File file : dir.listFiles()) { files.put(file.getName(), file); } return files; } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processNameFile(Reader preader, String genomeBuild) throws Exception, ObjectStoreException { /* model name * Xenbase tropicalis gene ID * gene symbol * gene name */ System.out.println("Processing Names file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 4) { LOG.error("Couldn't process line. 
Expected 4 cols, but was " + line.length); continue; } String jgimodelname = line[0].trim(); String primaryIdentifier = line[1].trim(); String symbol = line[2].trim(); String name = line[3].trim(); Item gene = genes.get(primaryIdentifier); if (gene != null) { gene.setAttribute("name", name); } } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processNcbiProteinFile(Reader preader) throws Exception, ObjectStoreException { /* gi * accession * Xenbase gene ID * gene symbol * 1000735 AAA84444 XB-GENE-865674 gsk3b */ System.out.println("Processing NCBI-PROTEIN file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 4) { LOG.error("Couldn't process line. Expected 4 cols, but was " + line.length); continue; } String gi = line[0].trim(); String accession = line[1].trim(); String xenbaseGeneId = line[2].trim(); String symbol = line[3].trim(); //System.out.println("identifier.." 
+ gi + " accession " + accession); Item gene = genes.get(xenbaseGeneId); if (gene != null) { getCrossReference(gene.getIdentifier(), gi, accession, "NCBI Protein"); } } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processUnigeneEntrezFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase gene ID * gene symbol * Entrez ID * Unigene ID * XB-GENE-478054 trnt1 394602 Str.7616 */ System.out.println("Processing UNIGENE ENTREZ file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 4) { LOG.error("Couldn't process line. Expected 4 cols, but was " + line.length); continue; } String xenbaseGeneId = line[0].trim(); String symbol = line[1].trim(); String entrezId = line[2].trim(); String unigeneId = line[3].trim(); Item gene = genes.get(xenbaseGeneId); if (gene != null) { if (StringUtils.isNotEmpty(entrezId)) getCrossReference(gene.getIdentifier(), entrezId, "", "Entrez Gene ID"); if (StringUtils.isNotEmpty(unigeneId)) getCrossReference(gene.getIdentifier(), unigeneId, "", "Unigene ID"); } } preader.close(); } private String getCrossReference(String subjectId, String id, String accession, String source) throws ObjectStoreException { String refId = ""; Item crf = createItem("CrossReference"); crf.setReference("subject", subjectId); crf.setAttribute("identifier", id); if (StringUtils.isNotEmpty(accession)) { crf.setAttribute("accession", accession); } String dsId = datasources.get(source); if (dsId == null) { Item ds = createItem("DataSource"); ds.setAttribute("name", source); ds.setAttribute("url", "http://www.ncbi.nlm.nih.gov/entrez/"); try { store(ds); } catch (ObjectStoreException e) { throw new ObjectStoreException(e); } crf.setReference("source", ds.getIdentifier()); 
datasources.put(source, ds.getIdentifier()); } else { crf.setReference("source", dsId); } try { store(crf); } catch (ObjectStoreException e) { throw new ObjectStoreException(e); } refId = crf.getIdentifier(); return refId; } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processNcbiMrnaFile(Reader preader) throws Exception, ObjectStoreException { /* gi * accession * Xenbase gene ID * gene symbol * 1000735 AAA84444 XB-GENE-865674 gsk3b */ System.out.println("Processing NCBI-MRNA file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 4) { LOG.error("Couldn't process line. Expected 4 cols, but was " + line.length); continue; } String gi = line[0].trim(); String accession = line[1].trim(); String xenbaseGeneId = line[2].trim(); String symbol = line[3].trim(); Item gene = genes.get(xenbaseGeneId); if (gene != null) { getCrossReference(gene.getIdentifier(), gi, accession, "NCBI mRNA"); } } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processInteractionsFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase gene ID * gene symbol * interactant ID:interactant symbol:co-citation occurrence */ System.out.println("Processing Interactants file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 3) { LOG.error("Couldn't process line. 
Expected 3 cols, but was " + line.length); continue; } String genePageId = line[0].trim().substring(12); String symbol = line[1].trim(); String interactants = line[2].trim(); //genePageId-PMID:symbol:someNumber if (interactants == null || interactants.length() == 0 || interactants.equals("")) { continue; } String[] interactingGenes = interactants.split(","); //System.out.println("Gene Page.."+ genePageId + " Length of inG.." + interactingGenes.length); HashSet geneIds = genesPageName.get(genePageId); Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene = genes.get(geneId); if (interactingGenes.length != 0) { for (int i = 0; i < interactingGenes.length; i++) { String gpid[] = interactingGenes[i].split("\\:"); String genePageIdInteractant = gpid[0]; //System.out.println("gene interactant.." + genePageIdInteractant); HashSet interactingGeneIds = genesPageName.get(genePageIdInteractant); if (interactingGeneIds == null) { continue; } Iterator itt = interactingGeneIds.iterator(); while (itt.hasNext()) { String interactingGeneId = (String) itt.next(); Item interactingGene = genes.get(interactingGeneId); if (gene != null && interactingGene != null) { getInteraction(gene.getIdentifier(), interactingGene.getIdentifier()); } } } } } } preader.close(); } private void getInteraction(String refId, String gene2RefId) throws ObjectStoreException { //MultiKey key = new MultiKey(refId, gene2RefId); //Item interaction = interactionsnew.get(key); //if (interaction == null) { Item interaction = createItem("Interaction"); interaction.setReference("gene1", refId); interaction.setReference("gene2", gene2RefId); //interactionsnew.put(key, interaction); store(interaction); //} //return interaction; } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processExpressionFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase Gene ID - XB-GENE-865139 * Symbol - nr5a2-a * Genotype - wild type 
* Anatomical Tissue - XAO:0000132 liver and biliary system,XAO:0000467 dorsal pancreatic bud,XAO:0001103 ventral pancreatic bud,XAO:0003266 liver primordium * Start Stage - XAO:1000049 NF stage 35 and 36 * End Stge - XAO:1000053 NF stage 41 * assay - in situ hybridization * EvidenceID - Image - XB-IMG-26168 * Expreriment ID - XB-EXP-6382 * Source - Published * Literature ID - XB-ART-39030 * Curation Status - Complete manual curation */ System.out.println("Processing Expression file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); //getCurrentFile() } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 12) { LOG.error("Couldn't process line. Expected 12 cols, but was " + line.length); continue; } String xenbaseGeneId = line[0].trim(); String symbol = line[1].trim(); String genotype = line[2].trim(); String anatomicalTissue = line[3].trim(); String startStageFull = line[4].trim(); //XAO:1000049 NF stage 35 and 36 String[] t = startStageFull.split("\\s"); String startStage = t[0]; String endStageFull = line[5].trim(); String[] te = endStageFull.split("\\s"); String endStage = te[0]; String assay = line[6].trim(); String imageId = line[7].trim(); String experimentId = line[8].trim(); String source = line[9].trim(); String literatureId = line[10].trim(); //XB-ART-ID figure out how to merge based on PMID String pmid = pmid_xbartid.get(literatureId); String refId = getPub(pmid, literatureId); String status = line[11].trim(); //figure out how to store this stuff Item expression = createItem("ExpressionResult"); if (StringUtils.isNotEmpty(genotype)) { expression.setAttribute("genotype", genotype); } if (StringUtils.isNotEmpty(assay)) { expression.setAttribute("assay", assay); } if (StringUtils.isNotEmpty(experimentId)) { expression.setAttribute("experimentId", experimentId); } if 
(StringUtils.isNotEmpty(source)) { expression.setAttribute("source", source); } if (StringUtils.isNotEmpty(status)) { expression.setAttribute("curationStatus", status); } Item sterm = getTerm(startStage); Item eterm = getTerm(endStage); expression.setReference("startStage", sterm.getIdentifier()); expression.setReference("endStage", eterm.getIdentifier()); expression.setReference("publication", refId); setImage(expression, URL + imageId + ".jpg"); setAnatomicalTissues(expression, anatomicalTissue, "anatomicalStages"); Item gene = genes.get(xenbaseGeneId); if (gene != null) { //XB-GENE-478125 is not in other files expression.setReference("gene", gene.getIdentifier()); store(expression); //don't store all other items created if there is no gene to associate with..? } } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processAnatomyMappingFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase Gene ID * gene symbol * start stage * end stage * tissues * XB-GENEPAGE-478053 trnt1 XAO:1000031 NF stage 10.5 XAO:1000009 frog * XAO:0000001 ectoderm,XAO:0003024 head,XAO:0000129 intestine,XAO:0000133 liver,XAO:0000256 oocyte, * XAO:0000258 ovary,XAO:0000157 testis,XAO:0003004 whole organism */ System.out.println("Processing Anatomy Mapping file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); //getCurrentFile() } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); String xenbaseGeneId = line[0].trim().substring(12); String symbol = line[1].trim(); String startStageFull = line[2].trim(); //XAO:1000049 NF stage 35 and 36 String[] t = startStageFull.split("\\s"); String startStage = t[0]; String endStageFull = line[3].trim(); String[] te = endStageFull.split("\\s"); String endStage = te[0]; String anatomicalTissue = line[4].trim(); 
System.out.println(xenbaseGeneId); HashSet geneIds = genesPageName.get(xenbaseGeneId); //check this stuff with what is in genes..a thorough once over Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene = genes.get(geneId); if (gene != null) { //figure out how to store this stuff Item anatomy = createItem("AnatomyMapping"); Item sterm = getTerm(startStage); Item eterm = getTerm(endStage); anatomy.setReference("startStage", sterm.getIdentifier()); anatomy.setReference("endStage", eterm.getIdentifier()); setAnatomicalTissues(anatomy, anatomicalTissue, "tissues"); anatomy.setReference("gene", gene.getIdentifier()); store(anatomy); //don't store all other items created if there is no gene to associate with..? } } } preader.close(); } private void setImage(Item result, String img) throws ObjectStoreException { Item item = imgs.get(img); if (item == null) { item = createItem("Image"); item.setAttribute("url", img); imgs.put(img, item); store(item); result.setReference("image", item.getIdentifier()); } else { result.setReference("image", item.getIdentifier()); } } private void setAnatomicalTissues(Item result, String at, String collectionName) throws ObjectStoreException { //XAO:0000132 liver and biliary system, XAO:0000467 //dorsal pancreatic bud, XAO:0001103 ventral pancreatic bud, XAO:0003266 liver primordium String[] temp = at.split(","); for (int i = 0; i < temp.length; i++) { String[] t2 = temp[i].split("\\s"); String term = t2[0]; Item aterm = getTerm(term); if (aterm != null) { result.addToCollection(collectionName, aterm.getIdentifier()); } } } /** * * @param term * @return * @throws ObjectStoreException */ private Item getTerm(String term) throws ObjectStoreException { Item storedRef = terms.get(term); if (storedRef == null) { storedRef = createItem("XenopusAnatomyTerm"); if (StringUtils.isNotEmpty(term)) { storedRef.setAttribute("identifier", term); } store(storedRef); terms.put(term, storedRef); } return 
storedRef; } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processLiteratureFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase Literature ID * PMID * Xenbase genePage IDs associated with the PMID above */ System.out.println("Processing Literature file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); //getCurrentFile() } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 3) { LOG.error("Couldn't process line. Expected 3 cols, but was " + line.length); continue; } String literatureId = line[0].trim().substring(7); String pmid = line[1].trim(); String litGenes = line[2].trim(); //XB-GENEPAGE-491748 elavl2,XB-GENEPAGE-481418 gdf1,XB-GENEPAGE-481799 elavl1 if (litGenes.isEmpty() || litGenes == null) { System.out.println("empty 3rd column."); continue; } String storedRefId = getPub(pmid, literatureId); String[] gids = litGenes.split(","); if (gids.length != 0) { for (int i = 0; i < gids.length; i++) { String gpid[] = gids[i].split("\\s"); String genePageId = gpid[0].substring(12); HashSet geneIds = genesPageName.get(genePageId); //check this stuff with what is in genes..a thorough once over Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene = genes.get(geneId); if (gene != null) { gene.addToCollection("publications", storedRefId); } } } } else { String gpid[] = litGenes.split("\\s"); String genePageId = gpid[0].substring(12); HashSet geneIds = genesPageName.get(genePageId); //check this stuff with what is in genes..a thorough once over Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene = genes.get(geneId); if (gene != null) { gene.addToCollection("publications", storedRefId); } } } } preader.close(); } /** * * @param reader * 
@throws Exception * @throws ObjectStoreException */ private void processGoFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase genepage ID * gene symbol * GO Ids (comma separated) */ System.out.println("Processing GO file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); //getCurrentFile() } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 3) { LOG.error("Couldn't process line. Expected 3 cols, but was " + line.length); continue; } String genePageId = line[0].trim().substring(12); String symbol = line[1].trim(); String goids = line[2].trim(); //GO:0005525,GO:0007264,GO:0015031 HashSet geneIds = genesPageName.get(genePageId); if (geneIds == null) { System.out.println("genePageId: null for genes.."); continue; } Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene = genes.get(geneId); //System.out.println("genePageId: " + genePageId + " geneId " + geneId); String[] gids = goids.split(","); for (int i = 0; i < gids.length; i++) { String goTermIdentifier = gids[i]; if (gene != null) { createGoAnnotation(gene.getIdentifier(), gene, goTermIdentifier, "XenBase", "XenBase"); } } } } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processEnsemblFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase gene ID * gene symbol * description * ENSXETG00000xxxx - ensembl ID */ System.out.println("Processing Ensembl file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); //getCurrentFile() } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 4) { LOG.error("Couldn't process line. 
Expected 4 cols, but was " + line.length); continue; } String geneId = line[0].trim(); String symbol = line[1].trim(); String name = line[2].trim(); String ensemblId = line[3].trim(); Item gene = genes.get(geneId); if (gene != null && ensemblId != null) { gene.setAttribute("ensemblIdentifier", ensemblId); } } preader.close(); } /** * * @param reader * @throws Exception * @throws ObjectStoreException */ private void processNonEntrezOrthologFile(Reader preader) throws Exception, ObjectStoreException { /* Xenbase * OMIM * MGI * ZFIN */ System.out.println("Processing Non Entrze Orthologs.. XenbaseGeneNonEntrezOrthologMapping.txt file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new Exception("cannot parse file: " + preader.toString(), e); } String prevXenopusId = ""; while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 4) { LOG.error("Couldn't process line. Expected 4 cols, but was " + line.length); continue; } System.out.println(line[0].trim()); String xenopusId = line[0].trim().substring(8); //it doesn;t say GENEPAGE!! 
String omimId = line[1].trim(); String mgiId = line[2].trim(); String zfinId = line[3].trim(); String chickenId = line[4].trim(); if (prevXenopusId == xenopusId) { HashSet geneIds = genesPageName.get(xenopusId); Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene0 = genes.get(geneId); if (gene0 != null) { if (!StringUtils.isEmpty(zfinId)) { String gene3 = getGene(zfinId, "7955"); if (gene3 != null) processHomologues(gene0.getIdentifier(), gene3); } } } } else { HashSet geneIds = genesPageName.get(xenopusId); Iterator it = geneIds.iterator(); while (it.hasNext()) { String geneId = (String) it.next(); Item gene0 = genes.get(geneId); if (gene0 != null) { if (!StringUtils.isEmpty(omimId)) { String gene1 = getGene(omimId, "9606"); if (gene1 != null) processHomologues(gene0.getIdentifier(), gene1); } if (!StringUtils.isEmpty(mgiId)) { String gene2 = getGene(mgiId, "10090"); if (gene2 != null) processHomologues(gene0.getIdentifier(), gene2); } if (!StringUtils.isEmpty(zfinId)) { String gene3 = getGene(zfinId, "7955"); if (gene3 != null) processHomologues(gene0.getIdentifier(), gene3); } if (!StringUtils.isEmpty(chickenId)) { String gene4 = getGene(chickenId, "9031"); if (gene4 != null) processHomologues(gene0.getIdentifier(), gene4); } } } } prevXenopusId = xenopusId; } preader.close(); }

    /**
     * Reads the zebrafish ortholog mapping file and stores one Homologue per
     * (Xenopus gene, zebrafish gene) pair.
     *
     * @param preader reader over the tab-delimited mapping file; closed before returning
     * @throws Exception if the file cannot be parsed
     * @throws ObjectStoreException if an item cannot be stored
     */
    private void processZebrafishOrthologFile(Reader preader)
        throws Exception, ObjectStoreException {
        /* Columns: entrez/NCBI gene ID, Xenbase GenePage, symbol, name */
        System.out.println("Processing ZebraFish Ortholog file....");
        loadEntrezOrthologs(preader, "7955");
    }

    /**
     * Reads the human ortholog mapping file and stores one Homologue per
     * (Xenopus gene, human gene) pair.
     *
     * @param preader reader over the tab-delimited mapping file; closed before returning
     * @throws Exception if the file cannot be parsed
     * @throws ObjectStoreException if an item cannot be stored
     */
    private void processHumanOrthologFile(Reader preader)
        throws Exception, ObjectStoreException {
        /* Columns: entrez/NCBI gene ID, Xenbase GenePage, symbol, name */
        System.out.println("Processing Human Ortholog file....");
        loadEntrezOrthologs(preader, "9606");
    }

    /**
     * Reads the mouse ortholog mapping file and stores one Homologue per
     * (Xenopus gene, mouse gene) pair.
     *
     * @param preader reader over the tab-delimited mapping file; closed before returning
     * @throws Exception if the file cannot be parsed
     * @throws ObjectStoreException if an item cannot be stored
     */
    private void processMouseOrthologFile(Reader preader)
        throws Exception, ObjectStoreException {
        /* Columns: entrez/NCBI gene ID, Xenbase GenePage, symbol, name */
        System.out.println("Processing Mouse Ortholog file....");
        loadEntrezOrthologs(preader, "10090");
    }

    /**
     * Shared loop behind the zebrafish, human and mouse ortholog loaders.
     * For every row it looks up (or creates) the ortholog gene for the given
     * taxon and links it to each already-loaded gene registered under the
     * row's gene page.
     *
     * @param preader reader over a tab-delimited file with at least four columns;
     *        always closed before this method returns
     * @param orthologTaxonId taxon id assigned to ortholog genes created from column 0
     * @throws Exception if the file cannot be parsed or an item cannot be stored
     */
    private void loadEntrezOrthologs(Reader preader, String orthologTaxonId) throws Exception {
        try {
            Iterator<?> tsvIter;
            try {
                tsvIter = FormattedTextParser.parseTabDelimitedReader(preader);
            } catch (Exception e) {
                throw new Exception("cannot parse file: " + preader.toString(), e);
            }
            while (tsvIter.hasNext()) {
                String[] line = (String[]) tsvIter.next();
                if (line.length < 4) {
                    LOG.error("Couldn't process line. Expected 4 cols, but was " + line.length);
                    continue;
                }
                String orthologId = line[0].trim();
                String genePageId = stripGenePagePrefix(line[1].trim());
                if (StringUtils.isEmpty(genePageId) || StringUtils.isEmpty(orthologId)) {
                    continue;
                }
                String orthologRef = getGene(orthologId, orthologTaxonId);
                HashSet geneIds = genesPageName.get(genePageId);
                if (geneIds == null) {
                    // A gene page that was never registered used to abort the whole
                    // load with a NullPointerException; skip the row instead.
                    LOG.warn("No genes registered for gene page " + genePageId
                            + "; skipping ortholog " + orthologId);
                    continue;
                }
                for (Object geneId : geneIds) {
                    Item xenopusGene = genes.get(geneId);
                    if (xenopusGene != null && orthologRef != null) {
                        processHomologues(xenopusGene.getIdentifier(), orthologRef);
                    }
                }
            }
        } finally {
            // Release the reader even when a row fails part-way through.
            preader.close();
        }
    }

    /**
     * Drops the first 12 characters of a gene page column value.
     * NOTE(review): presumably this strips an "XB-GENEPAGE-" style prefix -
     * confirm against the input files.
     *
     * @param field trimmed column value
     * @return the text after the first 12 characters, or an empty string when
     *         the value is too short to contain anything after them
     */
    private String stripGenePagePrefix(String field) {
        final int prefixLength = 12;
        if (field.length() <= prefixLength) {
            return "";
        }
        return field.substring(prefixLength);
    }

    /**
     * Creates and stores a new ontology term item and an annotation item that
     * links it to the given product, and registers the annotation in the
     * product's "goAnnotation" collection.
     *
     * @param productIdentifier item identifier used as the annotation's subject
     * @param product item whose "goAnnotation" collection receives the annotation
     * @param termIdentifier value stored in the term's "identifier" attribute
     * @param dataSource not used by this method
     * @param dataSourceCode not used by this method
     * @throws ObjectStoreException if an item cannot be stored
     */
    private void createGoAnnotation(String productIdentifier, Item product, String termIdentifier,
            String dataSource, String dataSourceCode) throws ObjectStoreException {
        // A fresh term item is created on every call; terms are not cached here.
        Item term = createItem(termClassName);
        term.setAttribute("identifier", termIdentifier);
        store(term);

        Item annotation = createItem(annotationClassName);
        annotation.setReference("subject", productIdentifier);
        annotation.setReference("ontologyTerm", term.getIdentifier());
        product.addToCollection("goAnnotation", annotation.getIdentifier());
        store(annotation);
    }

    /**
     * Returns the identifier of the DataSet item cached under the given code,
     * creating and storing the item on first use.
     *
     * @param dataSource used in the data set title and, verbatim, as the target
     *        of the "dataSource" reference
     *        (NOTE(review): presumably an item identifier - verify against callers)
     * @param code cache key for the data set
     * @return identifier of the DataSet item
     * @throws ObjectStoreException if the item cannot be stored
     */
    private String getDataset(String dataSource, String code) throws ObjectStoreException {
        String cachedId = dataSets.get(code);
        if (cachedId != null) {
            return cachedId;
        }
        String title = "GO Annotation from " + dataSource;
        Item dataSet = createItem("DataSet");
        dataSet.setAttribute("name", title);
        dataSet.setReference("dataSource", dataSource);
        String newId = dataSet.getIdentifier();
        dataSets.put(code, newId);
        store(dataSet);
        return newId;
    }

    /**
     * Reads the gene page mapping file and registers every gene id listed in a
     * row under that row's gene page, then prints the resulting mapping.
     *
     * @param preader reader over the tab-delimited mapping file; closed before returning
     * @throws Exception if the file cannot be parsed
     * @throws ObjectStoreException declared for callers; see addGenePageId
     */
    private void processGenePageFile(Reader preader) throws Exception, ObjectStoreException {
        /* Columns:
         *   XB-GENEPAGE ID
         *   XB-GENE IDs for tropicalis and laevis, comma-separated
         * It seems like the first one is tropicalis, the rest are laevis (a & b).
         */
        System.out.println("Processing Gene Page file....");
        try {
            Iterator<?> tsvIter;
            try {
                tsvIter = FormattedTextParser.parseTabDelimitedReader(preader);
            } catch (Exception e) {
                throw new Exception("cannot parse file: " + preader.toString(), e);
            }
            while (tsvIter.hasNext()) {
                String[] line = (String[]) tsvIter.next();
                if (line.length < 2) {
                    LOG.error("Couldn't process line. Expected 2 cols, but was " + line.length);
                    continue;
                }
                String genePageName = stripGenePagePrefix(line[0].trim());
                if (StringUtils.isEmpty(genePageName)) {
                    LOG.error("Couldn't process line. No gene page id in '" + line[0] + "'");
                    continue;
                }
                // split() yields the whole value when it contains no comma, so a
                // single id and a comma-separated list take the same path.
                String[] refs = line[1].trim().split(",");
                for (int i = 0; i < refs.length; i++) {
                    addGenePageId(refs[i].trim(), genePageName);
                }
            }
        } finally {
            preader.close();
        }

        // Print out the contents of the mapping.
        for (Map.Entry<String, HashSet> entry : genesPageName.entrySet()) {
            System.out.printf("Key : %s and Value: %s %n", entry.getKey(), entry.getValue());
        }
    }

    /**
     * Registers a gene id under a gene page and, when that gene has already
     * been loaded, records the gene page on it as "xenbaseGeneId".
     *
     * @param Id gene id to register
     * @param genePageName gene page the id belongs to
     * @throws Exception declared for callers; not thrown by the visible code
     * @throws ObjectStoreException declared for callers; not thrown by the visible code
     */
    @SuppressWarnings("unchecked") // genesPageName is declared with a raw HashSet value type
    private void addGenePageId(String Id, String genePageName)
        throws Exception, ObjectStoreException {
        Item gene = genes.get(Id);
        if (gene != null) {
            gene.setAttribute("xenbaseGeneId", genePageName);
        }

        HashSet idsForPage = genesPageName.get(genePageName);
        if (idsForPage == null) {
            idsForPage = new HashSet<String>();
            genesPageName.put(genePageName, idsForPage);
        }
        idsForPage.add(Id);
    }

    /**
     * Reads the manually curated general info file and, for every loaded gene
     * registered under a row's gene page, sets its description and alias and
     * stores one Synonym per alias.
     *
     * @param preader reader over the tab-delimited file; closed before returning
     * @throws Exception if the file cannot be parsed
     * @throws ObjectStoreException if an item cannot be stored
     */
    private void processSynFile(Reader preader) throws Exception, ObjectStoreException {
        /* Columns:
         *   Xenbase gene ID
         *   gene symbol
         *   gene name
         *   gene function -- stored as description
         *   gene synonyms
         *   JGI ID -- missing from the file, not required
         */
        System.out.println("Processing Synonym file....");
        try {
            Iterator<?> tsvIter;
            try {
                tsvIter = FormattedTextParser.parseTabDelimitedReader(preader);
            } catch (Exception e) {
                throw new Exception("cannot parse file: " + preader.toString(), e);
            }
            while (tsvIter.hasNext()) {
                String[] line = (String[]) tsvIter.next();
                if (line.length < 5) {
                    LOG.error("Couldn't process line. Expected 5 cols, but was " + line.length);
                    continue;
                }
                String genePageId = stripGenePagePrefix(line[0].trim());
                String desc = line[3].trim();
                // Named "aliases" so it no longer shadows the synonyms field.
                String aliases = line[4].trim();

                HashSet geneIds = genesPageName.get(genePageId);
                if (geneIds == null) {
                    // Previously a NullPointerException that aborted the load.
                    LOG.warn("No genes registered for gene page '" + genePageId
                            + "'; skipping line");
                    continue;
                }
                for (Object geneId : geneIds) {
                    Item gene = genes.get(geneId);
                    if (gene == null) {
                        System.out
                                .println("gene page id for a gene that is not laoded in the prev file.."
                                        + genePageId);
                        continue;
                    }
                    if (!StringUtils.isEmpty(desc)) {
                        gene.setAttribute("briefDescription", desc);
                    }
                    if (!StringUtils.isEmpty(aliases)) {
                        gene.setAttribute("alias", aliases);
                        // split() returns the whole value when there is no '|'.
                        String[] syns = aliases.split("\\|");
                        for (int i = 0; i < syns.length; i++) {
                            getSynonym(gene.getIdentifier(), syns[i]);
                        }
                    }
                }
            }
        } finally {
            preader.close();
        }
    }

    /**
     * Creates and stores a Synonym item for the given subject unless the value
     * is empty or already present in the synonyms map.
     *
     * @param subjectId item identifier the synonym refers to
     * @param value synonym text
     * @return identifier of the synonym item, or null when value is empty
     * @throws ObjectStoreException if the item cannot be stored
     */
    private String getSynonym(String subjectId, String value) throws ObjectStoreException {
        if (StringUtils.isEmpty(value)) {
            return null;
        }
        // NOTE(review): nothing in this method adds to the synonyms map, so unless
        // it is filled elsewhere this lookup always misses and a new item is
        // created on every call. Left as is because a Synonym carries its subject,
        // so a cache keyed on value alone would drop synonyms shared between genes.
        String refId = synonyms.get(value);
        if (refId == null) {
            Item syn = createItem("Synonym");
            syn.setReference("subject", subjectId);
            syn.setAttribute("value", value);
            refId = syn.getIdentifier();
            store(syn);
        }
        return refId;
    }

    /**
     * Returns the identifier of the Chromosome item for the given primary
     * identifier, creating and storing it on first use.
     *
     * @param identifier chromosome primary identifier; also the cache key
     * @param orgId item identifier set as the chromosome's organism
     * @return identifier of the Chromosome item, or null when identifier is empty
     * @throws ObjectStoreException if the item cannot be stored
     */
    private String getChromosome(String identifier, String orgId) throws ObjectStoreException {
        if (StringUtils.isEmpty(identifier)) {
            return null;
        }
        // NOTE(review): the cache key ignores orgId, so two organisms sharing a
        // chromosome/scaffold name would share one item - confirm this cannot happen.
        String refId = chromosomes.get(identifier);
        if (refId == null) {
            Item chromosome = createItem("Chromosome");
            chromosome.setAttribute("primaryIdentifier", identifier);
            chromosome.setReference("organism", orgId);
            refId = chromosome.getIdentifier();
            chromosomes.put(identifier, refId);
            store(chromosome);
        }
        return refId;
    }

    /**
     * Creates and stores a Location for the subject on the given chromosome and,
     * when both coordinates are present, sets the subject's "length" attribute.
     *
     * @param subject feature being located
     * @param chromosomeRefId item identifier of the chromosome
     * @param startCoord start coordinate; omitted from the location when empty
     * @param stopCoord end coordinate; omitted from the location when empty
     * @param strand strand; omitted from the location when empty
     * @return identifier of the stored Location item
     * @throws ObjectStoreException if the item cannot be stored
     * @throws NumberFormatException if both coordinates are present and either
     *         is not a valid integer
     */
    private String getLocation(Item subject, String chromosomeRefId, String startCoord,
            String stopCoord, String strand) throws ObjectStoreException {
        boolean hasStart = !StringUtils.isEmpty(startCoord);
        boolean hasEnd = !StringUtils.isEmpty(stopCoord);

        if (hasStart && hasEnd) {
            subject.setAttribute("length", getLength(startCoord, stopCoord));
        }

        Item location = createItem("Location");
        if (hasStart) {
            location.setAttribute("start", startCoord);
        }
        if (hasEnd) {
            location.setAttribute("end", stopCoord);
        }
        if (!StringUtils.isEmpty(strand)) {
            location.setAttribute("strand", strand);
        }
        location.setReference("feature", subject);
        location.setReference("locatedOn", chromosomeRefId);
        store(location);
        return location.getIdentifier();
    }

    /**
     * Returns the distance between two coordinates as a decimal string. The
     * coordinates may be given in either order.
     *
     * @param start one coordinate, as a decimal integer
     * @param end the other coordinate, as a decimal integer
     * @return the larger coordinate minus the smaller one
     * @throws NumberFormatException if either argument is not a valid integer
     */
    private String getLength(String start, String end) throws NumberFormatException {
        int first = Integer.parseInt(start);
        int second = Integer.parseInt(end);
        int low = Math.min(first, second);
        int high = Math.max(first, second);
        return Integer.toString(high - low);
    }

    /**
     * Creates and stores a CrossReference from the subject to an external
     * identifier, creating and storing the DataSource item on first use of a
     * source name.
     *
     * @param subjectId item identifier the cross reference belongs to
     * @param id external identifier
     * @param source data source name; also the cache key for the DataSource item
     * @return identifier of the stored CrossReference item
     * @throws ObjectStoreException if an item cannot be stored
     */
    private String getCrossReference(String subjectId, String id, String source)
        throws ObjectStoreException {
        // The CrossReference is created before the DataSource so that item
        // identifiers are allocated in the same order as before.
        Item crossRef = createItem("CrossReference");
        crossRef.setReference("subject", subjectId);
        crossRef.setAttribute("identifier", id);

        String dataSourceId = datasources.get(source);
        if (dataSourceId == null) {
            Item dataSource = createItem("DataSource");
            dataSource.setAttribute("name", source);
            store(dataSource);
            dataSourceId = dataSource.getIdentifier();
            datasources.put(source, dataSourceId);
        }
        crossRef.setReference("source", dataSourceId);

        store(crossRef);
        return crossRef.getIdentifier();
    }

    /**
     * Returns the identifier of the Publication item cached under the given
     * PubMed id, creating and storing it on first use and recording the
     * literature id to PubMed id mapping.
     *
     * @param pubMedId PubMed id; also the cache key. When empty, the publication
     *        is created without attributes.
     * @param literatureId value stored as "DbPubId" when it and pubMedId are non-empty
     * @return identifier of the Publication item
     * @throws ObjectStoreException if the item cannot be stored
     */
    private String getPub(String pubMedId, String literatureId) throws ObjectStoreException {
        Item publication = publications.get(pubMedId);
        if (publication == null) {
            publication = createItem("Publication");
            if (StringUtils.isNotEmpty(pubMedId)) {
                publication.setAttribute("pubMedId", pubMedId);
                // As with the other setters in this class, never hand an empty
                // value to setAttribute.
                if (StringUtils.isNotEmpty(literatureId)) {
                    publication.setAttribute("DbPubId", literatureId);
                }
            }
            store(publication);
            publications.put(pubMedId, publication);
            pmid_xbartid.put(literatureId, pubMedId);
        }
        return publication.getIdentifier();
    }

    /**
     * Returns the identifier of the Gene item cached under the given id,
     * creating it on first use. New genes are only added to the genes map here;
     * storeGenes() writes the contents of that map.
     *
     * @param geneId value stored as the gene's primaryIdentifier; also the cache key
     * @param taxonId taxon id used to resolve the organism reference; may be null,
     *        in which case no organism is set
     * @return identifier of the Gene item
     * @throws ObjectStoreException if the organism lookup fails
     */
    private String getGene(String geneId, String taxonId) throws ObjectStoreException {
        Item cached = genes.get(geneId);
        if (cached != null) {
            return cached.getIdentifier();
        }
        Item gene = createItem("Gene");
        gene.setAttribute("primaryIdentifier", geneId);
        if (taxonId != null) {
            gene.setReference("organism", getOrganism(taxonId));
        }
        genes.put(geneId, gene);
        return gene.getIdentifier();
    }

    /**
     * Creates and stores a Homologue item linking two genes. Does nothing when
     * either identifier is null.
     *
     * @param gene1 item identifier set as the "gene" reference
     * @param gene2 item identifier set as the "homologue" reference
     * @throws ObjectStoreException if the item cannot be stored
     */
    private void processHomologues(String gene1, String gene2) throws ObjectStoreException {
        if (gene1 != null && gene2 != null) {
            Item homologue = createItem("Homologue");
            homologue.setReference("gene", gene1);
            homologue.setReference("homologue", gene2);
            store(homologue);
        }
    }

    /**
     * Stores every gene currently held in the genes map.
     *
     * @throws ObjectStoreException if a gene cannot be stored
     */
    private void storeGenes() throws ObjectStoreException {
        for (Item gene : genes.values()) {
            store(gene);
        }
    }
}