List of usage examples for org.jdom2 Element getChild
public Element getChild(final String cname)
From source file: com.bio4j.neo4jdb.programs.ImportUniprot.java
License: Open Source License
public static void main(String[] args) { if (args.length != 4) { System.out.println("This program expects the following parameters: \n" + "1. Uniprot xml filename \n" + "2. Bio4j DB folder \n" + "3. batch inserter .properties file \n" + "4. Config XML file"); } else {/* w ww .j a v a2 s. c om*/ long initTime = System.nanoTime(); File inFile = new File(args[0]); File configFile = new File(args[3]); String currentAccessionId = ""; BatchInserter inserter = null; BatchInserterIndexProvider indexProvider = null; BufferedWriter enzymeIdsNotFoundBuff = null; BufferedWriter statsBuff = null; int proteinCounter = 0; int limitForPrintingOut = 10000; try { // This block configures the logger with handler and formatter fh = new FileHandler("ImportUniprot" + args[0].split("\\.")[0] + ".log", false); SimpleFormatter formatter = new SimpleFormatter(); fh.setFormatter(formatter); logger.addHandler(fh); logger.setLevel(Level.ALL); System.out.println("Reading conf file..."); BufferedReader reader = new BufferedReader(new FileReader(configFile)); String line; StringBuilder stBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { stBuilder.append(line); } reader.close(); UniprotDataXML uniprotDataXML = new UniprotDataXML(stBuilder.toString()); //---creating writer for enzymes not found file----- enzymeIdsNotFoundBuff = new BufferedWriter(new FileWriter(new File("EnzymeIdsNotFound.log"))); //---creating writer for stats file----- statsBuff = new BufferedWriter(new FileWriter( new File("ImportUniprotStats_" + inFile.getName().split("\\.")[0] + ".txt"))); // create the batch inserter inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2]))); // create the batch index service indexProvider = new LuceneBatchInserterIndexProvider(inserter); //-----------------create batch indexes---------------------------------- //---------------------------------------------------------------------- BatchInserterIndex proteinAccessionIndex = indexProvider.nodeIndex( 
ProteinNode.PROTEIN_ACCESSION_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex proteinFullNameFullTextIndex = indexProvider.nodeIndex( ProteinNode.PROTEIN_FULL_NAME_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex proteinGeneNamesFullTextIndex = indexProvider.nodeIndex( ProteinNode.PROTEIN_GENE_NAMES_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex proteinEnsemblPlantsIndex = indexProvider.nodeIndex( ProteinNode.PROTEIN_ENSEMBL_PLANTS_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex datasetNameIndex = indexProvider.nodeIndex(DatasetNode.DATASET_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex keywordIdIndex = indexProvider.nodeIndex(KeywordNode.KEYWORD_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex keywordNameIndex = indexProvider.nodeIndex(KeywordNode.KEYWORD_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex interproIdIndex = indexProvider.nodeIndex(InterproNode.INTERPRO_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex pfamIdIndex = indexProvider.nodeIndex(PfamNode.PFAM_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex goTermIdIndex = indexProvider.nodeIndex(GoTermNode.GO_TERM_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex organismScientificNameIndex = indexProvider.nodeIndex( OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex organismNcbiTaxonomyIdIndex = indexProvider.nodeIndex( OrganismNode.ORGANISM_NCBI_TAXONOMY_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex taxonNameIndex = 
indexProvider.nodeIndex(TaxonNode.TAXON_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex genomeElementVersionIndex = indexProvider.nodeIndex( GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex reactomeTermIdIndex = indexProvider.nodeIndex( ReactomeTermNode.REACTOME_TERM_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex enzymeIdIndex = indexProvider.nodeIndex(EnzymeNode.ENZYME_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex mainNodesIndex = indexProvider.nodeIndex(Bio4jManager.MAIN_NODES_INDEX_NAME, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); //---------------------------------------------------------------------- //---------------------------------------------------------------------- reader = new BufferedReader(new FileReader(inFile)); StringBuilder entryStBuilder = new StringBuilder(); //---------------------------------------------------------------------- //------------------------looking up for main nodes--------------------- alternativeProductInitiationId = mainNodesIndex .get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_INITIATION) .getSingle(); alternativeProductPromoterId = mainNodesIndex .get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_PROMOTER) .getSingle(); alternativeProductSplicingId = mainNodesIndex .get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_SPLICING) .getSingle(); alternativeProductRibosomalFrameshiftingId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.ALTERNATIVE_PRODUCT_RIBOSOMAL_FRAMESHIFTING).getSingle(); seqCautionErroneousInitiationId = mainNodesIndex 
.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_INITIATION) .getSingle(); seqCautionErroneousTranslationId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_TRANSLATION).getSingle(); seqCautionFrameshiftId = mainNodesIndex .get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_FRAMESHIFT) .getSingle(); seqCautionErroneousTerminationId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_TERMINATION).getSingle(); seqCautionMiscellaneousDiscrepancyId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_MISCELLANEOUS_DISCREPANCY).getSingle(); seqCautionErroneousGeneModelPredictionId = mainNodesIndex.get(Bio4jManager.MAIN_NODES_INDEX_NAME, Bio4jManager.SEQUENCE_CAUTION_ERRONEOUS_GENE_MODEL_PREDICTION).getSingle(); //---------------------------------------------------------------------- //---------------------------------------------------------------------------------- //---------------------initializing node type properties---------------------------- organismProperties.put(OrganismNode.NODE_TYPE_PROPERTY, OrganismNode.NODE_TYPE); proteinProperties.put(ProteinNode.NODE_TYPE_PROPERTY, ProteinNode.NODE_TYPE); keywordProperties.put(KeywordNode.NODE_TYPE_PROPERTY, KeywordNode.NODE_TYPE); subcellularLocationProperties.put(SubcellularLocationNode.NODE_TYPE_PROPERTY, SubcellularLocationNode.NODE_TYPE); interproProperties.put(InterproNode.NODE_TYPE_PROPERTY, InterproNode.NODE_TYPE); pfamProperties.put(PfamNode.NODE_TYPE_PROPERTY, PfamNode.NODE_TYPE); taxonProperties.put(TaxonNode.NODE_TYPE_PROPERTY, TaxonNode.NODE_TYPE); datasetProperties.put(DatasetNode.NODE_TYPE_PROPERTY, DatasetNode.NODE_TYPE); personProperties.put(PersonNode.NODE_TYPE_PROPERTY, PersonNode.NODE_TYPE); consortiumProperties.put(ConsortiumNode.NODE_TYPE_PROPERTY, ConsortiumNode.NODE_TYPE); 
instituteProperties.put(InstituteNode.NODE_TYPE_PROPERTY, InstituteNode.NODE_TYPE); thesisProperties.put(ThesisNode.NODE_TYPE_PROPERTY, ThesisNode.NODE_TYPE); bookProperties.put(BookNode.NODE_TYPE_PROPERTY, BookNode.NODE_TYPE); patentProperties.put(PatentNode.NODE_TYPE_PROPERTY, PatentNode.NODE_TYPE); articleProperties.put(ArticleNode.NODE_TYPE_PROPERTY, ArticleNode.NODE_TYPE); submissionProperties.put(SubmissionNode.NODE_TYPE_PROPERTY, SubmissionNode.NODE_TYPE); onlineArticleProperties.put(OnlineArticleNode.NODE_TYPE_PROPERTY, OnlineArticleNode.NODE_TYPE); unpublishedObservationProperties.put(UnpublishedObservationNode.NODE_TYPE_PROPERTY, UnpublishedObservationNode.NODE_TYPE); publisherProperties.put(PublisherNode.NODE_TYPE_PROPERTY, PublisherNode.NODE_TYPE); cityProperties.put(CityNode.NODE_TYPE_PROPERTY, CityNode.NODE_TYPE); journalProperties.put(JournalNode.NODE_TYPE_PROPERTY, JournalNode.NODE_TYPE); onlineJournalProperties.put(OnlineJournalNode.NODE_TYPE_PROPERTY, OnlineJournalNode.NODE_TYPE); countryProperties.put(CountryNode.NODE_TYPE_PROPERTY, CountryNode.NODE_TYPE); isoformProperties.put(IsoformNode.NODE_TYPE_PROPERTY, IsoformNode.NODE_TYPE); commentTypeProperties.put(CommentTypeNode.NODE_TYPE_PROPERTY, CommentTypeNode.NODE_TYPE); featureTypeProperties.put(FeatureTypeNode.NODE_TYPE_PROPERTY, FeatureTypeNode.NODE_TYPE); //----------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------- while ((line = reader.readLine()) != null) { if (line.trim().startsWith("<" + UniprotStuff.ENTRY_TAG_NAME)) { while (!line.trim().startsWith("</" + UniprotStuff.ENTRY_TAG_NAME + ">")) { entryStBuilder.append(line); line = reader.readLine(); } //linea final del organism entryStBuilder.append(line); //System.out.println("organismStBuilder.toString() = " + organismStBuilder.toString()); XMLElement entryXMLElem = new XMLElement(entryStBuilder.toString()); 
entryStBuilder.delete(0, entryStBuilder.length()); String modifiedDateSt = entryXMLElem.asJDomElement() .getAttributeValue(UniprotStuff.ENTRY_MODIFIED_DATE_ATTRIBUTE); String accessionSt = entryXMLElem.asJDomElement() .getChildText(UniprotStuff.ENTRY_ACCESSION_TAG_NAME); String nameSt = entryXMLElem.asJDomElement().getChildText(UniprotStuff.ENTRY_NAME_TAG_NAME); String fullNameSt = getProteinFullName( entryXMLElem.asJDomElement().getChild(UniprotStuff.PROTEIN_TAG_NAME)); String shortNameSt = getProteinShortName( entryXMLElem.asJDomElement().getChild(UniprotStuff.PROTEIN_TAG_NAME)); if (shortNameSt == null) { shortNameSt = ""; } if (fullNameSt == null) { fullNameSt = ""; } currentAccessionId = accessionSt; //-----------alternative accessions------------- ArrayList<String> alternativeAccessions = new ArrayList<>(); List<Element> altAccessionsList = entryXMLElem.asJDomElement() .getChildren(UniprotStuff.ENTRY_ACCESSION_TAG_NAME); for (int i = 1; i < altAccessionsList.size(); i++) { alternativeAccessions.add(altAccessionsList.get(i).getText()); } proteinProperties.put(ProteinNode.ALTERNATIVE_ACCESSIONS_PROPERTY, convertToStringArray(alternativeAccessions)); //-----db references------------- String pirIdSt = ""; String keggIdSt = ""; String ensemblIdSt = ""; String uniGeneIdSt = ""; String arrayExpressIdSt = ""; List<Element> dbReferenceList = entryXMLElem.asJDomElement() .getChildren(UniprotStuff.DB_REFERENCE_TAG_NAME); ArrayList<String> emblCrossReferences = new ArrayList<>(); ArrayList<String> refseqReferences = new ArrayList<>(); ArrayList<String> enzymeDBReferences = new ArrayList<>(); ArrayList<String> ensemblPlantsReferences = new ArrayList<>(); HashMap<String, String> reactomeReferences = new HashMap<>(); for (Element dbReferenceElem : dbReferenceList) { String refId = dbReferenceElem.getAttributeValue("id"); switch (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE)) { case "Ensembl": ensemblIdSt = refId; break; case "PIR": pirIdSt = 
refId; break; case "UniGene": uniGeneIdSt = refId; break; case "KEGG": keggIdSt = refId; break; case "EMBL": emblCrossReferences.add(refId); break; case "EC": enzymeDBReferences.add(refId); break; case "ArrayExpress": arrayExpressIdSt = refId; break; case "RefSeq": //refseqReferences.add(refId); List<Element> children = dbReferenceElem.getChildren("property"); for (Element propertyElem : children) { if (propertyElem.getAttributeValue("type").equals("nucleotide sequence ID")) { refseqReferences.add(propertyElem.getAttributeValue("value")); } } break; case "Reactome": Element propertyElem = dbReferenceElem.getChild("property"); String pathwayName = ""; if (propertyElem.getAttributeValue("type").equals("pathway name")) { pathwayName = propertyElem.getAttributeValue("value"); } reactomeReferences.put(refId, pathwayName); break; case "EnsemblPlants": ensemblPlantsReferences.add(refId); break; } } Element sequenceElem = entryXMLElem.asJDomElement() .getChild(UniprotStuff.ENTRY_SEQUENCE_TAG_NAME); String sequenceSt = sequenceElem.getText(); int seqLength = Integer .parseInt(sequenceElem.getAttributeValue(UniprotStuff.SEQUENCE_LENGTH_ATTRIBUTE)); float seqMass = Float .parseFloat(sequenceElem.getAttributeValue(UniprotStuff.SEQUENCE_MASS_ATTRIBUTE)); //System.out.println("lalala " + seqMass); proteinProperties.put(ProteinNode.MODIFIED_DATE_PROPERTY, modifiedDateSt); proteinProperties.put(ProteinNode.ACCESSION_PROPERTY, accessionSt); proteinProperties.put(ProteinNode.NAME_PROPERTY, nameSt); proteinProperties.put(ProteinNode.FULL_NAME_PROPERTY, fullNameSt); proteinProperties.put(ProteinNode.SHORT_NAME_PROPERTY, shortNameSt); proteinProperties.put(ProteinNode.SEQUENCE_PROPERTY, sequenceSt); proteinProperties.put(ProteinNode.LENGTH_PROPERTY, seqLength); proteinProperties.put(ProteinNode.MASS_PROPERTY, seqMass); proteinProperties.put(ProteinNode.ARRAY_EXPRESS_ID_PROPERTY, arrayExpressIdSt); proteinProperties.put(ProteinNode.PIR_ID_PROPERTY, pirIdSt); 
proteinProperties.put(ProteinNode.KEGG_ID_PROPERTY, keggIdSt); proteinProperties.put(ProteinNode.EMBL_REFERENCES_PROPERTY, convertToStringArray(emblCrossReferences)); proteinProperties.put(ProteinNode.ENSEMBL_PLANTS_REFERENCES_PROPERTY, convertToStringArray(ensemblPlantsReferences)); proteinProperties.put(ProteinNode.ENSEMBL_ID_PROPERTY, ensemblIdSt); proteinProperties.put(ProteinNode.UNIGENE_ID_PROPERTY, uniGeneIdSt); //---------------gene-names------------------- Element geneElement = entryXMLElem.asJDomElement().getChild(UniprotStuff.GENE_TAG_NAME); ArrayList<String> geneNames = new ArrayList<>(); if (geneElement != null) { List<Element> genesList = geneElement.getChildren(UniprotStuff.GENE_NAME_TAG_NAME); for (Element geneNameElem : genesList) { geneNames.add(geneNameElem.getText()); } } proteinProperties.put(ProteinNode.GENE_NAMES_PROPERTY, convertToStringArray(geneNames)); //----------------------------------------- long currentProteinId = inserter.createNode(proteinProperties); proteinAccessionIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_ACCESSION_INDEX, accessionSt)); //indexing protein by alternative accessions for (String altAccessionSt : alternativeAccessions) { proteinAccessionIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_ACCESSION_INDEX, altAccessionSt)); } //---flushing protein accession index---- proteinAccessionIndex.flush(); //---adding protein node to node_type index---- nodeTypeIndex.add(currentProteinId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ProteinNode.NODE_TYPE)); //indexing protein by full name if (!fullNameSt.isEmpty()) { proteinFullNameFullTextIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_FULL_NAME_FULL_TEXT_INDEX, fullNameSt)); //System.out.println(fullNameSt.toUpperCase() + " , " + currentProteinId); } //indexing protein by gene names String geneNamesStToBeIndexed = ""; for (String geneNameSt : geneNames) { geneNamesStToBeIndexed += geneNameSt + " "; } 
proteinGeneNamesFullTextIndex.add(currentProteinId, MapUtil .map(ProteinNode.PROTEIN_GENE_NAMES_FULL_TEXT_INDEX, geneNamesStToBeIndexed)); //indexing protein by Ensembl plants references for (String ensemblPlantRef : ensemblPlantsReferences) { proteinEnsemblPlantsIndex.add(currentProteinId, MapUtil.map(ProteinNode.PROTEIN_ENSEMBL_PLANTS_INDEX, ensemblPlantRef)); } //--------------refseq associations---------------- if (uniprotDataXML.getRefseq()) { for (String refseqReferenceSt : refseqReferences) { //System.out.println("refseqReferenceSt = " + refseqReferenceSt); IndexHits<Long> hits = genomeElementVersionIndex .get(GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX, refseqReferenceSt); if (hits.hasNext()) { inserter.createRelationship(currentProteinId, hits.getSingle(), proteinGenomeElementRel, null); } else { logger.log(Level.INFO, ("GenomeElem not found for: " + currentAccessionId + " , " + refseqReferenceSt)); } } } //--------------reactome associations---------------- if (uniprotDataXML.getReactome()) { for (String reactomeId : reactomeReferences.keySet()) { long reactomeTermNodeId = -1; IndexHits<Long> reactomeTermIdIndexHits = reactomeTermIdIndex .get(ReactomeTermNode.REACTOME_TERM_ID_INDEX, reactomeId); if (reactomeTermIdIndexHits.hasNext()) { reactomeTermNodeId = reactomeTermIdIndexHits.getSingle(); } if (reactomeTermNodeId < 0) { reactomeTermProperties.put(ReactomeTermNode.ID_PROPERTY, reactomeId); reactomeTermProperties.put(ReactomeTermNode.PATHWAY_NAME_PROPERTY, reactomeReferences.get(reactomeId)); reactomeTermNodeId = inserter.createNode(reactomeTermProperties); reactomeTermIdIndex.add(reactomeTermNodeId, MapUtil.map(ReactomeTermNode.REACTOME_TERM_ID_INDEX, reactomeId)); //----flushing reactome index--- reactomeTermIdIndex.flush(); //---adding reactome term node to node_type index---- nodeTypeIndex.add(reactomeTermNodeId, MapUtil .map(Bio4jManager.NODE_TYPE_INDEX_NAME, ReactomeTermNode.NODE_TYPE)); } inserter.createRelationship(currentProteinId, 
reactomeTermNodeId, proteinReactomeRel, null); } } //------------------------------------------------------- //---------------enzyme db associations---------------------- if (uniprotDataXML.getEnzymeDb()) { for (String enzymeDBRef : enzymeDBReferences) { long enzymeNodeId; IndexHits<Long> enzymeIdIndexHits = enzymeIdIndex.get(EnzymeNode.ENZYME_ID_INDEX, enzymeDBRef); if (enzymeIdIndexHits.hasNext()) { enzymeNodeId = enzymeIdIndexHits.next(); inserter.createRelationship(currentProteinId, enzymeNodeId, proteinEnzymaticActivityRel, null); } else { enzymeIdsNotFoundBuff.write( "Enzyme term: " + enzymeDBRef + " not found.\t" + currentAccessionId); } } } //------------------------------------------------------------ //-----comments import--- if (uniprotDataXML.getComments()) { importProteinComments(entryXMLElem, inserter, indexProvider, currentProteinId, sequenceSt, uniprotDataXML); } //-----features import---- if (uniprotDataXML.getFeatures()) { importProteinFeatures(entryXMLElem, inserter, indexProvider, currentProteinId); } //--------------------------------datasets-------------------------------------------------- String proteinDataSetSt = entryXMLElem.asJDomElement() .getAttributeValue(UniprotStuff.ENTRY_DATASET_ATTRIBUTE); //long datasetId = indexService.getSingleNode(DatasetNode.DATASET_NAME_INDEX, proteinDataSetSt); long datasetId = -1; IndexHits<Long> datasetNameIndexHits = datasetNameIndex.get(DatasetNode.DATASET_NAME_INDEX, proteinDataSetSt); if (datasetNameIndexHits.hasNext()) { datasetId = datasetNameIndexHits.getSingle(); } if (datasetId < 0) { datasetProperties.put(DatasetNode.NAME_PROPERTY, proteinDataSetSt); datasetId = inserter.createNode(datasetProperties); datasetNameIndex.add(datasetId, MapUtil.map(DatasetNode.DATASET_NAME_INDEX, proteinDataSetSt)); //----flushing dataset name index--- datasetNameIndex.flush(); //---adding dataset node to node_type index---- nodeTypeIndex.add(datasetId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, 
DatasetNode.NODE_TYPE)); } inserter.createRelationship(currentProteinId, datasetId, proteinDatasetRel, null); //--------------------------------------------------------------------------------------------- if (uniprotDataXML.getCitations()) { importProteinCitations(entryXMLElem, inserter, indexProvider, currentProteinId, uniprotDataXML); } //-------------------------------keywords------------------------------------------------------ if (uniprotDataXML.getKeywords()) { List<Element> keywordsList = entryXMLElem.asJDomElement() .getChildren(UniprotStuff.KEYWORD_TAG_NAME); for (Element keywordElem : keywordsList) { String keywordId = keywordElem.getAttributeValue(UniprotStuff.KEYWORD_ID_ATTRIBUTE); String keywordName = keywordElem.getText(); long keywordNodeId = -1; IndexHits<Long> keyworIdIndexHits = keywordIdIndex.get(KeywordNode.KEYWORD_ID_INDEX, keywordId); if (keyworIdIndexHits.hasNext()) { keywordNodeId = keyworIdIndexHits.getSingle(); } if (keywordNodeId < 0) { keywordProperties.put(KeywordNode.ID_PROPERTY, keywordId); keywordProperties.put(KeywordNode.NAME_PROPERTY, keywordName); keywordNodeId = inserter.createNode(keywordProperties); keywordIdIndex.add(keywordNodeId, MapUtil.map(KeywordNode.KEYWORD_ID_INDEX, keywordId)); keywordNameIndex.add(keywordNodeId, MapUtil.map(KeywordNode.KEYWORD_NAME_INDEX, keywordName)); //---flushing keyword id index---- keywordIdIndex.flush(); //---adding keyword node to node_type index---- nodeTypeIndex.add(keywordNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, KeywordNode.NODE_TYPE)); } inserter.createRelationship(currentProteinId, keywordNodeId, proteinKeywordRel, null); } } //--------------------------------------------------------------------------------------- for (Element dbReferenceElem : dbReferenceList) { //-------------------------------INTERPRO------------------------------------------------------ if (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE) 
.equals(UniprotStuff.INTERPRO_DB_REFERENCE_TYPE)) { if (uniprotDataXML.getInterpro()) { String interproId = dbReferenceElem .getAttributeValue(UniprotStuff.DB_REFERENCE_ID_ATTRIBUTE); //long interproNodeId = indexService.getSingleNode(InterproNode.INTERPRO_ID_INDEX, interproId); long interproNodeId = -1; IndexHits<Long> interproIdIndexHits = interproIdIndex .get(InterproNode.INTERPRO_ID_INDEX, interproId); if (interproIdIndexHits.hasNext()) { interproNodeId = interproIdIndexHits.getSingle(); } if (interproNodeId < 0) { String interproEntryNameSt = ""; List<Element> properties = dbReferenceElem .getChildren(UniprotStuff.DB_REFERENCE_PROPERTY_TAG_NAME); for (Element prop : properties) { if (prop.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE) .equals(UniprotStuff.INTERPRO_ENTRY_NAME)) { interproEntryNameSt = prop.getAttributeValue( UniprotStuff.DB_REFERENCE_VALUE_ATTRIBUTE); break; } } interproProperties.put(InterproNode.ID_PROPERTY, interproId); interproProperties.put(InterproNode.NAME_PROPERTY, interproEntryNameSt); interproNodeId = inserter.createNode(interproProperties); interproIdIndex.add(interproNodeId, MapUtil.map(InterproNode.INTERPRO_ID_INDEX, interproId)); //flushing interpro id index interproIdIndex.flush(); //---adding interpro node to node_type index---- nodeTypeIndex.add(interproNodeId, MapUtil .map(Bio4jManager.NODE_TYPE_INDEX_NAME, InterproNode.NODE_TYPE)); } inserter.createRelationship(currentProteinId, interproNodeId, proteinInterproRel, null); } } //-------------------------------PFAM------------------------------------------------------ else if (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE) .equals("Pfam")) { if (uniprotDataXML.getPfam()) { String pfamId = dbReferenceElem .getAttributeValue(UniprotStuff.DB_REFERENCE_ID_ATTRIBUTE); long pfamNodeId = -1; IndexHits<Long> pfamIdIndexHits = pfamIdIndex.get(PfamNode.PFAM_ID_INDEX, pfamId); if (pfamIdIndexHits.hasNext()) { pfamNodeId = 
pfamIdIndexHits.getSingle(); } if (pfamNodeId < 0) { String pfamEntryNameSt = ""; List<Element> properties = dbReferenceElem .getChildren(UniprotStuff.DB_REFERENCE_PROPERTY_TAG_NAME); for (Element prop : properties) { if (prop.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE) .equals("entry name")) { pfamEntryNameSt = prop.getAttributeValue( UniprotStuff.DB_REFERENCE_VALUE_ATTRIBUTE); break; } } pfamProperties.put(PfamNode.ID_PROPERTY, pfamId); pfamProperties.put(PfamNode.NAME_PROPERTY, pfamEntryNameSt); pfamNodeId = inserter.createNode(pfamProperties); pfamIdIndex.add(pfamNodeId, MapUtil.map(PfamNode.PFAM_ID_INDEX, pfamId)); //flushing pfam id index pfamIdIndex.flush(); //---adding pfam node to node_type index---- nodeTypeIndex.add(pfamNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PfamNode.NODE_TYPE)); } inserter.createRelationship(currentProteinId, pfamNodeId, proteinPfamRel, null); } } //-------------------GO ----------------------------- else if (dbReferenceElem.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE) .toUpperCase().equals(UniprotStuff.GO_DB_REFERENCE_TYPE)) { if (uniprotDataXML.getGeneOntology()) { String goId = dbReferenceElem .getAttributeValue(UniprotStuff.DB_REFERENCE_ID_ATTRIBUTE); String evidenceSt = ""; List<Element> props = dbReferenceElem .getChildren(UniprotStuff.DB_REFERENCE_PROPERTY_TAG_NAME); for (Element element : props) { if (element.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE) .equals(UniprotStuff.EVIDENCE_TYPE_ATTRIBUTE)) { evidenceSt = element.getAttributeValue("value"); if (evidenceSt == null) { evidenceSt = ""; } break; } } long goTermNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, goId) .getSingle(); proteinGoProperties.put(ProteinGoRel.EVIDENCE_PROPERTY, evidenceSt); inserter.createRelationship(currentProteinId, goTermNodeId, proteinGoRel, proteinGoProperties); } } } //--------------------------------------------------------------------------------------- 
//--------------------------------------------------------------------------------------- //--------------------------------organism----------------------------------------------- String scName, commName, synName; scName = ""; commName = ""; synName = ""; Element organismElem = entryXMLElem.asJDomElement() .getChild(UniprotStuff.ORGANISM_TAG_NAME); List<Element> organismNames = organismElem.getChildren(UniprotStuff.ORGANISM_NAME_TAG_NAME); for (Element element : organismNames) { String type = element.getAttributeValue(UniprotStuff.ORGANISM_NAME_TYPE_ATTRIBUTE); switch (type) { case UniprotStuff.ORGANISM_SCIENTIFIC_NAME_TYPE: scName = element.getText(); break; case UniprotStuff.ORGANISM_COMMON_NAME_TYPE: commName = element.getText(); break; case UniprotStuff.ORGANISM_SYNONYM_NAME_TYPE: synName = element.getText(); break; } } //long organismNodeId = indexService.getSingleNode(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, scName); long organismNodeId = -1; IndexHits<Long> organismScientifiNameIndexHits = organismScientificNameIndex .get(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, scName); if (organismScientifiNameIndexHits.hasNext()) { organismNodeId = organismScientifiNameIndexHits.getSingle(); } if (organismNodeId < 0) { organismProperties.put(OrganismNode.COMMON_NAME_PROPERTY, commName); organismProperties.put(OrganismNode.SCIENTIFIC_NAME_PROPERTY, scName); organismProperties.put(OrganismNode.SYNONYM_NAME_PROPERTY, synName); List<Element> organismDbRefElems = organismElem .getChildren(UniprotStuff.DB_REFERENCE_TAG_NAME); boolean ncbiIdFound = false; if (organismDbRefElems != null) { for (Element dbRefElem : organismDbRefElems) { String t = dbRefElem.getAttributeValue("type"); if (t.equals("NCBI Taxonomy")) { organismProperties.put(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY, dbRefElem.getAttributeValue("id")); ncbiIdFound = true; break; } } } if (!ncbiIdFound) { organismProperties.put(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY, ""); } organismNodeId = 
inserter.createNode(organismProperties); organismScientificNameIndex.add(organismNodeId, MapUtil.map(OrganismNode.ORGANISM_SCIENTIFIC_NAME_INDEX, scName)); organismNcbiTaxonomyIdIndex.add(organismNodeId, MapUtil.map(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY, organismProperties.get(OrganismNode.NCBI_TAXONOMY_ID_PROPERTY))); //flushing organism scientifica name index organismScientificNameIndex.flush(); //---adding organism node to node_type index---- nodeTypeIndex.add(organismNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, OrganismNode.NODE_TYPE)); Element lineage = entryXMLElem.asJDomElement().getChild("organism").getChild("lineage"); List<Element> taxons = lineage.getChildren("taxon"); Element firstTaxonElem = taxons.get(0); //long firstTaxonId = indexService.getSingleNode(TaxonNode.TAXON_NAME_INDEX, firstTaxonElem.getText()); long firstTaxonId = -1; IndexHits<Long> firstTaxonIndexHits = taxonNameIndex.get(TaxonNode.TAXON_NAME_INDEX, firstTaxonElem.getText()); if (firstTaxonIndexHits.hasNext()) { firstTaxonId = firstTaxonIndexHits.getSingle(); } if (firstTaxonId < 0) { String firstTaxonName = firstTaxonElem.getText(); taxonProperties.put(TaxonNode.NAME_PROPERTY, firstTaxonName); firstTaxonId = createTaxonNode(taxonProperties, inserter, taxonNameIndex, nodeTypeIndex); //flushing taxon name index-- taxonNameIndex.flush(); } long lastTaxonId = firstTaxonId; for (int i = 1; i < taxons.size(); i++) { String taxonName = taxons.get(i).getText(); long currentTaxonId = -1; IndexHits<Long> currentTaxonIndexHits = taxonNameIndex .get(TaxonNode.TAXON_NAME_INDEX, taxonName); if (currentTaxonIndexHits.hasNext()) { currentTaxonId = currentTaxonIndexHits.getSingle(); } if (currentTaxonId < 0) { taxonProperties.put(TaxonNode.NAME_PROPERTY, taxonName); currentTaxonId = createTaxonNode(taxonProperties, inserter, taxonNameIndex, nodeTypeIndex); //flushing taxon name index-- taxonNameIndex.flush(); inserter.createRelationship(lastTaxonId, currentTaxonId, taxonParentRel, null); } 
lastTaxonId = currentTaxonId; } inserter.createRelationship(lastTaxonId, organismNodeId, taxonParentRel, null); } //--------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------- inserter.createRelationship(currentProteinId, organismNodeId, proteinOrganismRel, null); proteinCounter++; if ((proteinCounter % limitForPrintingOut) == 0) { String countProteinsSt = proteinCounter + " proteins inserted!!"; logger.log(Level.INFO, countProteinsSt); } } } } catch (Exception e) { logger.log(Level.SEVERE, ("Exception retrieving protein " + currentAccessionId)); logger.log(Level.SEVERE, e.getMessage()); StackTraceElement[] trace = e.getStackTrace(); for (StackTraceElement stackTraceElement : trace) { logger.log(Level.SEVERE, stackTraceElement.toString()); } } finally { try { //------closing writers------- enzymeIdsNotFoundBuff.close(); // shutdown, makes sure all changes are written to disk indexProvider.shutdown(); inserter.shutdown(); // closing logger file handler fh.close(); //-----------------writing stats file--------------------- long elapsedTime = System.nanoTime() - initTime; long elapsedSeconds = Math.round((elapsedTime / 1000000000.0)); long hours = elapsedSeconds / 3600; long minutes = (elapsedSeconds % 3600) / 60; long seconds = (elapsedSeconds % 3600) % 60; statsBuff.write("Statistics for program ImportUniprot:\nInput file: " + inFile.getName() + "\nThere were " + proteinCounter + " proteins inserted.\n" + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n"); //---closing stats writer--- statsBuff.close(); } catch (IOException ex) { Logger.getLogger(ImportUniprot.class.getName()).log(Level.SEVERE, null, ex); } } } }
From source file: com.bio4j.neo4jdb.programs.ImportUniprot.java
License: Open Source License
private static void importProteinFeatures(XMLElement entryXMLElem, BatchInserter inserter, BatchInserterIndexProvider indexProvider, long currentProteinId) { //-----------------create batch indexes---------------------------------- //---------------------------------------------------------------------- BatchInserterIndex featureTypeNameIndex = indexProvider.nodeIndex(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); //------------------------------------------------------------------------ //--------------------------------features---------------------------------------------------- List<Element> featuresList = entryXMLElem.asJDomElement().getChildren(UniprotStuff.FEATURE_TAG_NAME); for (Element featureElem : featuresList) { String featureTypeSt = featureElem.getAttributeValue(UniprotStuff.FEATURE_TYPE_ATTRIBUTE); //long featureTypeNodeId = indexService.getSingleNode(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, featureTypeSt); long featureTypeNodeId = -1; IndexHits<Long> featureTypeNameIndexHits = featureTypeNameIndex .get(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, featureTypeSt); if (featureTypeNameIndexHits.hasNext()) { featureTypeNodeId = featureTypeNameIndexHits.getSingle(); }/* www . 
j a va 2 s.c om*/ featureTypeNameIndexHits.close(); if (featureTypeNodeId < 0) { featureTypeProperties.put(FeatureTypeNode.NAME_PROPERTY, featureTypeSt); featureTypeNodeId = inserter.createNode(featureTypeProperties); //indexService.index(featureTypeNodeId, FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, featureTypeSt); featureTypeNameIndex.add(featureTypeNodeId, MapUtil.map(FeatureTypeNode.FEATURE_TYPE_NAME_INDEX, featureTypeSt)); //---flushing feature type name index---- featureTypeNameIndex.flush(); //---adding feature type node to node_type index---- nodeTypeIndex.add(featureTypeNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, FeatureTypeNode.NODE_TYPE)); } String featureDescSt = featureElem.getAttributeValue(UniprotStuff.FEATURE_DESCRIPTION_ATTRIBUTE); if (featureDescSt == null) { featureDescSt = ""; } String featureIdSt = featureElem.getAttributeValue(UniprotStuff.FEATURE_ID_ATTRIBUTE); if (featureIdSt == null) { featureIdSt = ""; } String featureStatusSt = featureElem.getAttributeValue(UniprotStuff.STATUS_ATTRIBUTE); if (featureStatusSt == null) { featureStatusSt = ""; } String featureEvidenceSt = featureElem.getAttributeValue(UniprotStuff.EVIDENCE_ATTRIBUTE); if (featureEvidenceSt == null) { featureEvidenceSt = ""; } Element locationElem = featureElem.getChild(UniprotStuff.FEATURE_LOCATION_TAG_NAME); Element positionElem = locationElem.getChild(UniprotStuff.FEATURE_POSITION_TAG_NAME); String beginFeatureSt; String endFeatureSt; if (positionElem != null) { beginFeatureSt = positionElem.getAttributeValue(UniprotStuff.FEATURE_POSITION_POSITION_ATTRIBUTE); endFeatureSt = beginFeatureSt; } else { beginFeatureSt = locationElem.getChild(UniprotStuff.FEATURE_LOCATION_BEGIN_TAG_NAME) .getAttributeValue(UniprotStuff.FEATURE_LOCATION_POSITION_ATTRIBUTE); endFeatureSt = locationElem.getChild(UniprotStuff.FEATURE_LOCATION_END_TAG_NAME) .getAttributeValue(UniprotStuff.FEATURE_LOCATION_POSITION_ATTRIBUTE); } if (beginFeatureSt == null) { beginFeatureSt = ""; } if 
(endFeatureSt == null) { endFeatureSt = ""; } String originalSt = featureElem.getChildText(UniprotStuff.FEATURE_ORIGINAL_TAG_NAME); String variationSt = featureElem.getChildText(UniprotStuff.FEATURE_VARIATION_TAG_NAME); if (originalSt == null) { originalSt = ""; } if (variationSt == null) { variationSt = ""; } String featureRefSt = featureElem.getAttributeValue(UniprotStuff.FEATURE_REF_ATTRIBUTE); if (featureRefSt == null) { featureRefSt = ""; } featureProperties.put(BasicFeatureRel.DESCRIPTION_PROPERTY, featureDescSt); featureProperties.put(BasicFeatureRel.ID_PROPERTY, featureIdSt); featureProperties.put(BasicFeatureRel.EVIDENCE_PROPERTY, featureEvidenceSt); featureProperties.put(BasicFeatureRel.STATUS_PROPERTY, featureStatusSt); featureProperties.put(BasicFeatureRel.BEGIN_PROPERTY, beginFeatureSt); featureProperties.put(BasicFeatureRel.END_PROPERTY, endFeatureSt); featureProperties.put(BasicFeatureRel.ORIGINAL_PROPERTY, originalSt); featureProperties.put(BasicFeatureRel.VARIATION_PROPERTY, variationSt); featureProperties.put(BasicFeatureRel.REF_PROPERTY, featureRefSt); switch (featureTypeSt) { case ActiveSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, activeSiteFeatureRel, featureProperties); break; case BindingSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, bindingSiteFeatureRel, featureProperties); break; case CrossLinkFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, crossLinkFeatureRel, featureProperties); break; case GlycosylationSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, glycosylationSiteFeatureRel, featureProperties); break; case InitiatorMethionineFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, initiatorMethionineFeatureRel, featureProperties); break; case 
LipidMoietyBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, lipidMoietyBindingRegionFeatureRel, featureProperties); break; case MetalIonBindingSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, metalIonBindingSiteFeatureRel, featureProperties); break; case ModifiedResidueFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, modifiedResidueFeatureRel, featureProperties); break; case NonStandardAminoAcidFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, nonStandardAminoAcidFeatureRel, featureProperties); break; case NonTerminalResidueFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, nonTerminalResidueFeatureRel, featureProperties); break; case PeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, peptideFeatureRel, featureProperties); break; case UnsureResidueFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, unsureResidueFeatureRel, featureProperties); break; case MutagenesisSiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, mutagenesisSiteFeatureRel, featureProperties); break; case SequenceVariantFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, sequenceVariantFeatureRel, featureProperties); break; case CalciumBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, calciumBindingRegionFeatureRel, featureProperties); break; case ChainFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, chainFeatureRel, featureProperties); break; case 
CoiledCoilRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, coiledCoilRegionFeatureRel, featureProperties); break; case CompositionallyBiasedRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, compositionallyBiasedRegionFeatureRel, featureProperties); break; case DisulfideBondFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, disulfideBondFeatureRel, featureProperties); break; case DnaBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, dnaBindingRegionFeatureRel, featureProperties); break; case DomainFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, domainFeatureRel, featureProperties); break; case HelixFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, helixFeatureRel, featureProperties); break; case IntramembraneRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, intramembraneRegionFeatureRel, featureProperties); break; case NonConsecutiveResiduesFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, nonConsecutiveResiduesFeatureRel, featureProperties); break; case NucleotidePhosphateBindingRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, nucleotidePhosphateBindingRegionFeatureRel, featureProperties); break; case PropeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, propeptideFeatureRel, featureProperties); break; case RegionOfInterestFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, regionOfInterestFeatureRel, 
featureProperties); break; case RepeatFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, repeatFeatureRel, featureProperties); break; case ShortSequenceMotifFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, shortSequenceMotifFeatureRel, featureProperties); break; case SignalPeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, signalPeptideFeatureRel, featureProperties); break; case SpliceVariantFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, spliceVariantFeatureRel, featureProperties); break; case StrandFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, strandFeatureRel, featureProperties); break; case TopologicalDomainFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, topologicalDomainFeatureRel, featureProperties); break; case TransitPeptideFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, transitPeptideFeatureRel, featureProperties); break; case TransmembraneRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, transmembraneRegionFeatureRel, featureProperties); break; case ZincFingerRegionFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, zincFingerRegionFeatureRel, featureProperties); break; case SiteFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, siteFeatureRel, featureProperties); break; case TurnFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, featureTypeNodeId, turnFeatureRel, featureProperties); break; case SequenceConflictFeatureRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: 
inserter.createRelationship(currentProteinId, featureTypeNodeId, sequenceConflictFeatureRel, featureProperties); break; } } }
From source file:com.bio4j.neo4jdb.programs.ImportUniprot.java
License:Open Source License
private static void importProteinComments(XMLElement entryXMLElem, BatchInserter inserter, BatchInserterIndexProvider indexProvider, long currentProteinId, String proteinSequence, UniprotDataXML uniprotDataXML) { //---------------indexes declaration--------------------------- BatchInserterIndex commentTypeNameIndex = indexProvider.nodeIndex(CommentTypeNode.COMMENT_TYPE_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex subcellularLocationNameIndex = indexProvider.nodeIndex( SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex isoformIdIndex = indexProvider.nodeIndex(IsoformNode.ISOFORM_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); //----------------------------------------------------------- List<Element> comments = entryXMLElem.asJDomElement().getChildren(UniprotStuff.COMMENT_TAG_NAME); for (Element commentElem : comments) { String commentTypeSt = commentElem.getAttributeValue(UniprotStuff.COMMENT_TYPE_ATTRIBUTE); Element textElem = commentElem.getChild("text"); String commentTextSt = ""; String commentStatusSt = ""; String commentEvidenceSt = ""; if (textElem != null) { commentTextSt = textElem.getText(); commentStatusSt = textElem.getAttributeValue("status"); if (commentStatusSt == null) { commentStatusSt = ""; }/*from w w w . jav a 2s . 
co m*/ commentEvidenceSt = textElem.getAttributeValue("evidence"); if (commentEvidenceSt == null) { commentEvidenceSt = ""; } } commentProperties.put(BasicCommentRel.TEXT_PROPERTY, commentTextSt); commentProperties.put(BasicCommentRel.STATUS_PROPERTY, commentStatusSt); commentProperties.put(BasicCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt); //-----------------COMMENT TYPE NODE RETRIEVING/CREATION---------------------- //long commentTypeId = indexService.getSingleNode(CommentTypeNode.COMMENT_TYPE_NAME_INDEX, commentTypeSt); IndexHits<Long> commentTypeNameIndexHits = commentTypeNameIndex .get(CommentTypeNode.COMMENT_TYPE_NAME_INDEX, commentTypeSt); long commentTypeId = -1; if (commentTypeNameIndexHits.hasNext()) { commentTypeId = commentTypeNameIndexHits.getSingle(); } commentTypeNameIndexHits.close(); if (commentTypeId < 0) { commentTypeProperties.put(CommentTypeNode.NAME_PROPERTY, commentTypeSt); commentTypeId = inserter.createNode(commentTypeProperties); commentTypeNameIndex.add(commentTypeId, MapUtil.map(CommentTypeNode.COMMENT_TYPE_NAME_INDEX, commentTypeSt)); //----flushing the indexation---- commentTypeNameIndex.flush(); //---adding comment type node to node_type index---- nodeTypeIndex.add(commentTypeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, CommentTypeNode.NODE_TYPE)); } //-----toxic dose---------------- switch (commentTypeSt) { case ToxicDoseCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, toxicDoseCommentRel, commentProperties); break; case CautionCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, cautionCommentRel, commentProperties); break; case CofactorCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, cofactorCommentRel, commentProperties); break; case DiseaseCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, diseaseCommentRel, 
commentProperties); break; case OnlineInformationCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: onlineInformationCommentProperties.put(OnlineInformationCommentRel.STATUS_PROPERTY, commentStatusSt); onlineInformationCommentProperties.put(OnlineInformationCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt); onlineInformationCommentProperties.put(OnlineInformationCommentRel.TEXT_PROPERTY, commentTextSt); String nameSt = commentElem.getAttributeValue("name"); if (nameSt == null) { nameSt = ""; } String linkSt = ""; Element linkElem = commentElem.getChild("link"); if (linkElem != null) { String uriSt = linkElem.getAttributeValue("uri"); if (uriSt != null) { linkSt = uriSt; } } onlineInformationCommentProperties.put(OnlineInformationCommentRel.NAME_PROPERTY, nameSt); onlineInformationCommentProperties.put(OnlineInformationCommentRel.LINK_PROPERTY, linkSt); inserter.createRelationship(currentProteinId, commentTypeId, onlineInformationCommentRel, onlineInformationCommentProperties); break; case TissueSpecificityCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, tissueSpecificityCommentRel, commentProperties); break; case FunctionCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, functionCommentRel, commentProperties); break; case BiotechnologyCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, biotechnologyCommentRel, commentProperties); break; case SubunitCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, subunitCommentRel, commentProperties); break; case PolymorphismCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, polymorphismCommentRel, commentProperties); break; case DomainCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, domainCommentRel, commentProperties); break; case 
PostTranslationalModificationCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, postTranslationalModificationCommentRel, commentProperties); break; case CatalyticActivityCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, catalyticActivityCommentRel, commentProperties); break; case DisruptionPhenotypeCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, disruptionPhenotypeCommentRel, commentProperties); break; case BioPhysicoChemicalPropertiesCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.STATUS_PROPERTY, commentStatusSt); biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt); biophysicochemicalCommentProperties.put(BioPhysicoChemicalPropertiesCommentRel.TEXT_PROPERTY, commentTextSt); String phDependenceSt = commentElem.getChildText("phDependence"); String temperatureDependenceSt = commentElem.getChildText("temperatureDependence"); if (phDependenceSt == null) { phDependenceSt = ""; } if (temperatureDependenceSt == null) { temperatureDependenceSt = ""; } String absorptionMaxSt = ""; String absorptionTextSt = ""; Element absorptionElem = commentElem.getChild("absorption"); if (absorptionElem != null) { absorptionMaxSt = absorptionElem.getChildText("max"); absorptionTextSt = absorptionElem.getChildText("text"); if (absorptionMaxSt == null) { absorptionMaxSt = ""; } if (absorptionTextSt == null) { absorptionTextSt = ""; } } String kineticsSt = ""; Element kineticsElem = commentElem.getChild("kinetics"); if (kineticsElem != null) { kineticsSt = new XMLElement(kineticsElem).toString(); } String redoxPotentialSt = ""; String redoxPotentialEvidenceSt = ""; Element redoxPotentialElem = commentElem.getChild("redoxPotential"); if (redoxPotentialElem != null) { redoxPotentialSt = 
redoxPotentialElem.getText(); redoxPotentialEvidenceSt = redoxPotentialElem.getAttributeValue("evidence"); if (redoxPotentialSt == null) { redoxPotentialSt = ""; } if (redoxPotentialEvidenceSt == null) { redoxPotentialEvidenceSt = ""; } } biophysicochemicalCommentProperties.put( BioPhysicoChemicalPropertiesCommentRel.TEMPERATURE_DEPENDENCE_PROPERTY, temperatureDependenceSt); biophysicochemicalCommentProperties .put(BioPhysicoChemicalPropertiesCommentRel.PH_DEPENDENCE_PROPERTY, phDependenceSt); biophysicochemicalCommentProperties .put(BioPhysicoChemicalPropertiesCommentRel.KINETICS_XML_PROPERTY, kineticsSt); biophysicochemicalCommentProperties .put(BioPhysicoChemicalPropertiesCommentRel.ABSORPTION_MAX_PROPERTY, absorptionMaxSt); biophysicochemicalCommentProperties .put(BioPhysicoChemicalPropertiesCommentRel.ABSORPTION_TEXT_PROPERTY, absorptionTextSt); biophysicochemicalCommentProperties.put( BioPhysicoChemicalPropertiesCommentRel.REDOX_POTENTIAL_EVIDENCE_PROPERTY, redoxPotentialEvidenceSt); biophysicochemicalCommentProperties .put(BioPhysicoChemicalPropertiesCommentRel.REDOX_POTENTIAL_PROPERTY, redoxPotentialSt); inserter.createRelationship(currentProteinId, commentTypeId, bioPhysicoChemicalPropertiesCommentRel, biophysicochemicalCommentProperties); break; case AllergenCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, allergenCommentRel, commentProperties); break; case PathwayCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, pathwayCommentRel, commentProperties); break; case InductionCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, inductionCommentRel, commentProperties); break; case ProteinSubcellularLocationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getSubcellularLocations()) { List<Element> subcLocations = commentElem .getChildren(UniprotStuff.SUBCELLULAR_LOCATION_TAG_NAME); for (Element 
subcLocation : subcLocations) { List<Element> locations = subcLocation.getChildren(UniprotStuff.LOCATION_TAG_NAME); Element firstLocation = locations.get(0); //long firstLocationId = indexService.getSingleNode(SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, firstLocation.getTextTrim()); long firstLocationId = -1; IndexHits<Long> firstLocationIndexHits = subcellularLocationNameIndex.get( SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, firstLocation.getTextTrim()); if (firstLocationIndexHits.hasNext()) { firstLocationId = firstLocationIndexHits.getSingle(); } firstLocationIndexHits.close(); long lastLocationId = firstLocationId; if (firstLocationId < 0) { subcellularLocationProperties.put(SubcellularLocationNode.NAME_PROPERTY, firstLocation.getTextTrim()); lastLocationId = createSubcellularLocationNode(subcellularLocationProperties, inserter, subcellularLocationNameIndex, nodeTypeIndex); //---flushing subcellular location name index--- subcellularLocationNameIndex.flush(); } for (int i = 1; i < locations.size(); i++) { long tempLocationId; IndexHits<Long> tempLocationIndexHits = subcellularLocationNameIndex.get( SubcellularLocationNode.SUBCELLULAR_LOCATION_NAME_INDEX, locations.get(i).getTextTrim()); if (tempLocationIndexHits.hasNext()) { tempLocationId = tempLocationIndexHits.getSingle(); tempLocationIndexHits.close(); } else { subcellularLocationProperties.put(SubcellularLocationNode.NAME_PROPERTY, locations.get(i).getTextTrim()); tempLocationId = createSubcellularLocationNode(subcellularLocationProperties, inserter, subcellularLocationNameIndex, nodeTypeIndex); subcellularLocationNameIndex.flush(); } inserter.createRelationship(tempLocationId, lastLocationId, subcellularLocationParentRel, null); lastLocationId = tempLocationId; } Element lastLocation = locations.get(locations.size() - 1); String evidenceSt = lastLocation.getAttributeValue(UniprotStuff.EVIDENCE_ATTRIBUTE); String statusSt = 
lastLocation.getAttributeValue(UniprotStuff.STATUS_ATTRIBUTE); String topologyStatusSt = ""; String topologySt = ""; Element topologyElem = subcLocation.getChild("topology"); if (topologyElem != null) { topologySt = topologyElem.getText(); topologyStatusSt = topologyElem.getAttributeValue("status"); } if (topologyStatusSt == null) { topologyStatusSt = ""; } if (topologySt == null) { topologySt = ""; } if (evidenceSt == null) { evidenceSt = ""; } if (statusSt == null) { statusSt = ""; } proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.EVIDENCE_PROPERTY, evidenceSt); proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.STATUS_PROPERTY, statusSt); proteinSubcellularLocationProperties.put(ProteinSubcellularLocationRel.TOPOLOGY_PROPERTY, topologySt); proteinSubcellularLocationProperties .put(ProteinSubcellularLocationRel.TOPOLOGY_STATUS_PROPERTY, topologyStatusSt); inserter.createRelationship(currentProteinId, lastLocationId, proteinSubcellularLocationRel, proteinSubcellularLocationProperties); } } break; case UniprotStuff.COMMENT_ALTERNATIVE_PRODUCTS_TYPE: if (uniprotDataXML.getIsoforms()) { List<Element> eventList = commentElem.getChildren("event"); List<Element> isoformList = commentElem.getChildren("isoform"); for (Element isoformElem : isoformList) { String isoformIdSt = isoformElem.getChildText("id"); String isoformNoteSt = isoformElem.getChildText("note"); String isoformNameSt = isoformElem.getChildText("name"); String isoformSeqSt = ""; Element isoSeqElem = isoformElem.getChild("sequence"); if (isoSeqElem != null) { String isoSeqTypeSt = isoSeqElem.getAttributeValue("type"); if (isoSeqTypeSt.equals("displayed")) { isoformSeqSt = proteinSequence; } } if (isoformNoteSt == null) { isoformNoteSt = ""; } if (isoformNameSt == null) { isoformNameSt = ""; } isoformProperties.put(IsoformNode.ID_PROPERTY, isoformIdSt); isoformProperties.put(IsoformNode.NOTE_PROPERTY, isoformNoteSt); isoformProperties.put(IsoformNode.NAME_PROPERTY, 
isoformNameSt); isoformProperties.put(IsoformNode.SEQUENCE_PROPERTY, isoformSeqSt); //-------------------------------------------------------- //long isoformId = indexService.getSingleNode(IsoformNode.ISOFORM_ID_INDEX, isoformIdSt); long isoformId = -1; IndexHits<Long> isoformIdIndexHits = isoformIdIndex.get(IsoformNode.ISOFORM_ID_INDEX, isoformIdSt); if (isoformIdIndexHits.hasNext()) { isoformId = isoformIdIndexHits.getSingle(); } isoformIdIndexHits.close(); if (isoformId < 0) { isoformId = createIsoformNode(isoformProperties, inserter, isoformIdIndex, nodeTypeIndex); } for (Element eventElem : eventList) { String eventTypeSt = eventElem.getAttributeValue("type"); switch (eventTypeSt) { case AlternativeProductInitiationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(isoformId, alternativeProductInitiationId, isoformEventGeneratorRel, null); break; case AlternativeProductPromoterRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(isoformId, alternativeProductPromoterId, isoformEventGeneratorRel, null); break; case AlternativeProductRibosomalFrameshiftingRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(isoformId, alternativeProductRibosomalFrameshiftingId, isoformEventGeneratorRel, null); break; case AlternativeProductSplicingRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(isoformId, alternativeProductSplicingId, isoformEventGeneratorRel, null); break; } } //protein isoform relationship inserter.createRelationship(currentProteinId, isoformId, proteinIsoformRel, null); } } break; case UniprotStuff.COMMENT_SEQUENCE_CAUTION_TYPE: sequenceCautionProperties.put(BasicProteinSequenceCautionRel.EVIDENCE_PROPERTY, commentEvidenceSt); sequenceCautionProperties.put(BasicProteinSequenceCautionRel.STATUS_PROPERTY, commentStatusSt); sequenceCautionProperties.put(BasicProteinSequenceCautionRel.TEXT_PROPERTY, commentTextSt); Element conflictElem = commentElem.getChild("conflict"); if (conflictElem != null) { String conflictTypeSt = 
conflictElem.getAttributeValue("type"); String resourceSt = ""; String idSt = ""; String versionSt = ""; ArrayList<String> positionsList = new ArrayList<>(); Element sequenceElem = conflictElem.getChild("sequence"); if (sequenceElem != null) { resourceSt = sequenceElem.getAttributeValue("resource"); if (resourceSt == null) { resourceSt = ""; } idSt = sequenceElem.getAttributeValue("id"); if (idSt == null) { idSt = ""; } versionSt = sequenceElem.getAttributeValue("version"); if (versionSt == null) { versionSt = ""; } } Element locationElem = commentElem.getChild("location"); if (locationElem != null) { Element positionElem = locationElem.getChild("position"); if (positionElem != null) { String tempPos = positionElem.getAttributeValue("position"); if (tempPos != null) { positionsList.add(tempPos); } } } sequenceCautionProperties.put(BasicProteinSequenceCautionRel.RESOURCE_PROPERTY, resourceSt); sequenceCautionProperties.put(BasicProteinSequenceCautionRel.ID_PROPERTY, idSt); sequenceCautionProperties.put(BasicProteinSequenceCautionRel.VERSION_PROPERTY, versionSt); switch (conflictTypeSt) { case ProteinErroneousGeneModelPredictionRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (positionsList.size() > 0) { for (String tempPosition : positionsList) { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, tempPosition); inserter.createRelationship(currentProteinId, seqCautionErroneousGeneModelPredictionId, proteinErroneousGeneModelPredictionRel, sequenceCautionProperties); } } else { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, ""); inserter.createRelationship(currentProteinId, seqCautionErroneousGeneModelPredictionId, proteinErroneousGeneModelPredictionRel, sequenceCautionProperties); } break; case ProteinErroneousInitiationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (positionsList.size() > 0) { for (String tempPosition : positionsList) { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, 
tempPosition); inserter.createRelationship(currentProteinId, seqCautionErroneousInitiationId, proteinErroneousInitiationRel, sequenceCautionProperties); } } else { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, ""); inserter.createRelationship(currentProteinId, seqCautionErroneousInitiationId, proteinErroneousInitiationRel, sequenceCautionProperties); } break; case ProteinErroneousTranslationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (positionsList.size() > 0) { for (String tempPosition : positionsList) { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, tempPosition); inserter.createRelationship(currentProteinId, seqCautionErroneousTranslationId, proteinErroneousTranslationRel, sequenceCautionProperties); } } else { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, ""); inserter.createRelationship(currentProteinId, seqCautionErroneousTranslationId, proteinErroneousTranslationRel, sequenceCautionProperties); } break; case ProteinErroneousTerminationRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (positionsList.size() > 0) { for (String tempPosition : positionsList) { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, tempPosition); inserter.createRelationship(currentProteinId, seqCautionErroneousTerminationId, proteinErroneousTerminationRel, sequenceCautionProperties); } } else { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, ""); inserter.createRelationship(currentProteinId, seqCautionErroneousTerminationId, proteinErroneousTerminationRel, sequenceCautionProperties); } break; case ProteinFrameshiftRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (positionsList.size() > 0) { for (String tempPosition : positionsList) { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, tempPosition); inserter.createRelationship(currentProteinId, seqCautionFrameshiftId, proteinFrameshiftRel, sequenceCautionProperties); } } else { 
sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, ""); inserter.createRelationship(currentProteinId, seqCautionFrameshiftId, proteinFrameshiftRel, sequenceCautionProperties); } break; case ProteinMiscellaneousDiscrepancyRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (positionsList.size() > 0) { for (String tempPosition : positionsList) { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, tempPosition); inserter.createRelationship(currentProteinId, seqCautionMiscellaneousDiscrepancyId, proteinMiscellaneousDiscrepancyRel, sequenceCautionProperties); } } else { sequenceCautionProperties.put(BasicProteinSequenceCautionRel.POSITION_PROPERTY, ""); inserter.createRelationship(currentProteinId, seqCautionMiscellaneousDiscrepancyId, proteinMiscellaneousDiscrepancyRel, sequenceCautionProperties); } break; } } break; case DevelopmentalStageCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, developmentalStageCommentRel, commentProperties); break; case MiscellaneousCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, miscellaneousCommentRel, commentProperties); break; case SimilarityCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, similarityCommentRel, commentProperties); break; case RnaEditingCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: rnaEditingCommentProperties.put(RnaEditingCommentRel.STATUS_PROPERTY, commentStatusSt); rnaEditingCommentProperties.put(RnaEditingCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt); rnaEditingCommentProperties.put(RnaEditingCommentRel.TEXT_PROPERTY, commentTextSt); List<Element> locationsList = commentElem.getChildren("location"); for (Element tempLoc : locationsList) { String positionSt = tempLoc.getChild("position").getAttributeValue("position"); rnaEditingCommentProperties.put(RnaEditingCommentRel.POSITION_PROPERTY, positionSt); 
inserter.createRelationship(currentProteinId, commentTypeId, rnaEditingCommentRel, rnaEditingCommentProperties); } break; case PharmaceuticalCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, pharmaceuticalCommentRel, commentProperties); break; case EnzymeRegulationCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: inserter.createRelationship(currentProteinId, commentTypeId, enzymeRegulationCommentRel, commentProperties); break; case MassSpectrometryCommentRel.UNIPROT_ATTRIBUTE_TYPE_VALUE: String methodSt = commentElem.getAttributeValue("method"); String massSt = commentElem.getAttributeValue("mass"); if (methodSt == null) { methodSt = ""; } if (massSt == null) { massSt = ""; } String beginSt = ""; String endSt = ""; Element locationElem = commentElem.getChild("location"); if (locationElem != null) { Element beginElem = commentElem.getChild("begin"); Element endElem = commentElem.getChild("end"); if (beginElem != null) { beginSt = beginElem.getAttributeValue("position"); } if (endElem != null) { endSt = endElem.getAttributeValue("position"); } } massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.STATUS_PROPERTY, commentStatusSt); massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.EVIDENCE_PROPERTY, commentEvidenceSt); massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.TEXT_PROPERTY, commentTextSt); massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.METHOD_PROPERTY, methodSt); massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.MASS_PROPERTY, massSt); massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.BEGIN_PROPERTY, beginSt); massSpectrometryCommentProperties.put(MassSpectrometryCommentRel.END_PROPERTY, endSt); inserter.createRelationship(currentProteinId, commentTypeId, massSpectrometryCommentRel, massSpectrometryCommentProperties); break; } } }
From source file:com.bio4j.neo4jdb.programs.ImportUniprot.java
License:Open Source License
private static String getProteinFullName(Element proteinElement) { if (proteinElement == null) { return ""; } else {// www.j av a2s. com Element recElem = proteinElement.getChild(UniprotStuff.PROTEIN_RECOMMENDED_NAME_TAG_NAME); if (recElem == null) { return ""; } else { return recElem.getChildText(UniprotStuff.PROTEIN_FULL_NAME_TAG_NAME); } } }
From source file:com.bio4j.neo4jdb.programs.ImportUniprot.java
License:Open Source License
/**
 * Returns the short recommended name of a protein.
 *
 * @param proteinElement element that holds the protein's name elements; may be {@code null}
 * @return the text of the short-name child of the recommended-name element, or an empty
 *         string when the protein element, the recommended-name element or the short-name
 *         child is missing; never {@code null}
 */
private static String getProteinShortName(Element proteinElement) {
    if (proteinElement == null) {
        return "";
    }
    Element recElem = proteinElement.getChild(UniprotStuff.PROTEIN_RECOMMENDED_NAME_TAG_NAME);
    if (recElem == null) {
        return "";
    }
    // getChildText returns null when the child element is absent; normalise it so that
    // every "missing" path of this method yields the same empty string.
    String shortName = recElem.getChildText(UniprotStuff.PROTEIN_SHORT_NAME_TAG_NAME);
    return (shortName == null) ? "" : shortName;
}
From source file:com.bio4j.neo4jdb.programs.ImportUniprot.java
License:Open Source License
private static void importProteinCitations(XMLElement entryXMLElem, BatchInserter inserter, BatchInserterIndexProvider indexProvider, long currentProteinId, UniprotDataXML uniprotDataXML) { //-----------------create batch indexes---------------------------------- //---------------------------------------------------------------------- BatchInserterIndex personNameIndex = indexProvider.nodeIndex(PersonNode.PERSON_NAME_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex consortiumNameIndex = indexProvider.nodeIndex(ConsortiumNode.CONSORTIUM_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex thesisTitleIndex = indexProvider.nodeIndex(ThesisNode.THESIS_TITLE_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex instituteNameIndex = indexProvider.nodeIndex(InstituteNode.INSTITUTE_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex countryNameIndex = indexProvider.nodeIndex(CountryNode.COUNTRY_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex cityNameIndex = indexProvider.nodeIndex(CityNode.CITY_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex patentNumberIndex = indexProvider.nodeIndex(PatentNode.PATENT_NUMBER_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex bookNameIndex = indexProvider.nodeIndex(BookNode.BOOK_NAME_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex publisherNameIndex = indexProvider.nodeIndex(PublisherNode.PUBLISHER_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex onlineArticleTitleIndex = indexProvider.nodeIndex( OnlineArticleNode.ONLINE_ARTICLE_TITLE_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); 
BatchInserterIndex onlineJournalNameIndex = indexProvider.nodeIndex( OnlineJournalNode.ONLINE_JOURNAL_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex submissionTitleIndex = indexProvider.nodeIndex(SubmissionNode.SUBMISSION_TITLE_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex articleTitleIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, FULL_TEXT_ST)); BatchInserterIndex articleDoiIdIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_DOI_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex articlePubmedIdIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_PUBMED_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex articleMedlineIdIndex = indexProvider.nodeIndex(ArticleNode.ARTICLE_MEDLINE_ID_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex journalNameIndex = indexProvider.nodeIndex(JournalNode.JOURNAL_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex dbNameIndex = indexProvider.nodeIndex(DBNode.DB_NAME_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); //---------------------------------------------------------------------- //---------------------------------------------------------------------- List<Element> referenceList = entryXMLElem.asJDomElement().getChildren(UniprotStuff.REFERENCE_TAG_NAME); for (Element reference : referenceList) { List<Element> citationsList = reference.getChildren(UniprotStuff.CITATION_TAG_NAME); for (Element citation : citationsList) { String citationType = citation.getAttributeValue(UniprotStuff.DB_REFERENCE_TYPE_ATTRIBUTE); List<Long> 
authorsPersonNodesIds = new ArrayList<>(); List<Long> authorsConsortiumNodesIds = new ArrayList<>(); List<Element> authorPersonElems = citation.getChild("authorList").getChildren("person"); List<Element> authorConsortiumElems = citation.getChild("authorList").getChildren("consortium"); for (Element person : authorPersonElems) { //long personId = indexService.getSingleNode(PersonNode.PERSON_NAME_INDEX, person.getAttributeValue("name")); long personId = -1; IndexHits<Long> personNameIndexHits = personNameIndex .get(PersonNode.PERSON_NAME_FULL_TEXT_INDEX, person.getAttributeValue("name")); if (personNameIndexHits.hasNext()) { personId = personNameIndexHits.getSingle(); }//from w ww . java 2 s .c o m personNameIndexHits.close(); if (personId < 0) { personProperties.put(PersonNode.NAME_PROPERTY, person.getAttributeValue("name")); personId = createPersonNode(personProperties, inserter, personNameIndex, nodeTypeIndex); //flushing person name index personNameIndex.flush(); } authorsPersonNodesIds.add(personId); } for (Element consortium : authorConsortiumElems) { long consortiumId = -1; IndexHits<Long> consortiumIdIndexHits = consortiumNameIndex .get(ConsortiumNode.CONSORTIUM_NAME_INDEX, consortium.getAttributeValue("name")); if (consortiumIdIndexHits.hasNext()) { consortiumId = consortiumIdIndexHits.getSingle(); } consortiumIdIndexHits.close(); if (consortiumId < 0) { consortiumProperties.put(ConsortiumNode.NAME_PROPERTY, consortium.getAttributeValue("name")); consortiumId = createConsortiumNode(consortiumProperties, inserter, consortiumNameIndex, nodeTypeIndex); //---flushing consortium name index-- consortiumNameIndex.flush(); } authorsConsortiumNodesIds.add(consortiumId); } //---------------------------------------------------------------------------- //-----------------------------THESIS----------------------------------------- switch (citationType) { case ThesisNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getThesis()) { String dateSt = 
citation.getAttributeValue("date"); String titleSt = citation.getChildText("title"); if (dateSt == null) { dateSt = ""; } if (titleSt == null) { titleSt = ""; } long thesisId = -1; IndexHits<Long> thesisTitleIndexHits = thesisTitleIndex .get(ThesisNode.THESIS_TITLE_FULL_TEXT_INDEX, titleSt); if (thesisTitleIndexHits.hasNext()) { thesisId = thesisTitleIndexHits.getSingle(); } thesisTitleIndexHits.close(); if (thesisId < 0) { thesisProperties.put(ThesisNode.DATE_PROPERTY, dateSt); thesisProperties.put(ThesisNode.TITLE_PROPERTY, titleSt); //---thesis node creation and indexing thesisId = inserter.createNode(thesisProperties); nodeTypeIndex.add(thesisId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ThesisNode.NODE_TYPE)); thesisTitleIndex.add(thesisId, MapUtil.map(ThesisNode.THESIS_TITLE_FULL_TEXT_INDEX, titleSt)); //flushing thesis title index thesisTitleIndex.flush(); //---authors association----- for (long personId : authorsPersonNodesIds) { inserter.createRelationship(thesisId, personId, thesisAuthorRel, null); } //-----------institute----------------------------- String instituteSt = citation.getAttributeValue("institute"); String countrySt = citation.getAttributeValue("country"); if (instituteSt != null) { long instituteId = -1; IndexHits<Long> instituteNameIndexHits = instituteNameIndex .get(InstituteNode.INSTITUTE_NAME_INDEX, instituteSt); if (instituteNameIndexHits.hasNext()) { instituteId = instituteNameIndexHits.getSingle(); } instituteNameIndexHits.close(); if (instituteId < 0) { instituteProperties.put(InstituteNode.NAME_PROPERTY, instituteSt); instituteId = createInstituteNode(instituteProperties, inserter, instituteNameIndex, nodeTypeIndex); //flushing institute name index instituteNameIndex.flush(); } if (countrySt != null) { //long countryId = indexService.getSingleNode(CountryNode.COUNTRY_NAME_INDEX, countrySt); long countryId = -1; IndexHits<Long> countryNameIndexHits = countryNameIndex .get(CountryNode.COUNTRY_NAME_INDEX, countrySt); if 
(countryNameIndexHits.hasNext()) { countryId = countryNameIndexHits.getSingle(); } countryNameIndexHits.close(); if (countryId < 0) { countryProperties.put(CountryNode.NAME_PROPERTY, countrySt); countryId = createCountryNode(countryProperties, inserter, countryNameIndex, nodeTypeIndex); //flushing country name index countryNameIndex.flush(); } inserter.createRelationship(instituteId, countryId, instituteCountryRel, null); } inserter.createRelationship(thesisId, instituteId, thesisInstituteRel, null); } } //--protein citation relationship inserter.createRelationship(thesisId, currentProteinId, thesisProteinCitationRel, null); } //---------------------------------------------------------------------------- //-----------------------------PATENT----------------------------------------- break; case PatentNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getPatents()) { String numberSt = citation.getAttributeValue("number"); String dateSt = citation.getAttributeValue("date"); String titleSt = citation.getChildText("title"); if (dateSt == null) { dateSt = ""; } if (titleSt == null) { titleSt = ""; } if (numberSt == null) { numberSt = ""; } if (!numberSt.equals("")) { long patentId = -1; IndexHits<Long> patentNumberIndexHits = patentNumberIndex .get(PatentNode.PATENT_NUMBER_INDEX, numberSt); if (patentNumberIndexHits.hasNext()) { patentId = patentNumberIndexHits.getSingle(); } patentNumberIndexHits.close(); if (patentId < 0) { patentProperties.put(PatentNode.NUMBER_PROPERTY, numberSt); patentProperties.put(PatentNode.DATE_PROPERTY, dateSt); patentProperties.put(PatentNode.TITLE_PROPERTY, titleSt); //---patent node creation and indexing patentId = inserter.createNode(patentProperties); patentNumberIndex.add(patentId, MapUtil.map(PatentNode.PATENT_NUMBER_INDEX, numberSt)); nodeTypeIndex.add(patentId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PatentNode.NODE_TYPE)); //---flushing patent number index--- patentNumberIndex.flush(); //---authors association----- for 
(long personId : authorsPersonNodesIds) { inserter.createRelationship(patentId, personId, patentAuthorRel, null); } } //--protein citation relationship inserter.createRelationship(patentId, currentProteinId, patentProteinCitationRel, null); } } //---------------------------------------------------------------------------- //-----------------------------SUBMISSION----------------------------------------- break; case SubmissionNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getSubmissions()) { String dateSt = citation.getAttributeValue("date"); String titleSt = citation.getChildText("title"); String dbSt = citation.getAttributeValue("db"); if (dateSt == null) { dateSt = ""; } if (titleSt == null) { titleSt = ""; } submissionProperties.put(SubmissionNode.DATE_PROPERTY, dateSt); submissionProperties.put(SubmissionNode.TITLE_PROPERTY, titleSt); long submissionId; IndexHits<Long> submissionTitleIndexHits = submissionTitleIndex .get(SubmissionNode.SUBMISSION_TITLE_INDEX, titleSt); if (submissionTitleIndexHits.hasNext()) { submissionId = submissionTitleIndexHits.getSingle(); submissionTitleIndexHits.close(); } else { //---submission node creation and indexing submissionId = inserter.createNode(submissionProperties); //--indexing node by type--- nodeTypeIndex.add(submissionId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, SubmissionNode.NODE_TYPE)); if (!titleSt.isEmpty()) { //--indexing node by title--- submissionTitleIndex.add(submissionId, MapUtil.map(SubmissionNode.SUBMISSION_TITLE_INDEX, titleSt)); submissionTitleIndex.flush(); } } //---authors association----- for (long personId : authorsPersonNodesIds) { inserter.createRelationship(submissionId, personId, submissionAuthorRel, null); } //---authors consortium association----- for (long consortiumId : authorsConsortiumNodesIds) { inserter.createRelationship(submissionId, consortiumId, submissionAuthorRel, null); } if (dbSt != null) { long dbId = -1; IndexHits<Long> dbNameIndexHits = 
dbNameIndex.get(DBNode.DB_NAME_INDEX, dbSt); if (dbNameIndexHits.hasNext()) { dbId = dbNameIndexHits.getSingle(); } dbNameIndexHits.close(); if (dbId < 0) { dbProperties.put(DBNode.NODE_TYPE_PROPERTY, DBNode.NODE_TYPE); dbProperties.put(DBNode.NAME_PROPERTY, dbSt); dbId = createDbNode(dbProperties, inserter, dbNameIndex, nodeTypeIndex); dbNameIndex.flush(); } //-----submission db relationship----- inserter.createRelationship(submissionId, dbId, submissionDbRel, null); } //--protein citation relationship inserter.createRelationship(submissionId, currentProteinId, submissionProteinCitationRel, null); } //---------------------------------------------------------------------------- //-----------------------------BOOK----------------------------------------- break; case BookNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getBooks()) { String nameSt = citation.getAttributeValue("name"); String dateSt = citation.getAttributeValue("date"); String titleSt = citation.getChildText("title"); String publisherSt = citation.getAttributeValue("publisher"); String firstSt = citation.getAttributeValue("first"); String lastSt = citation.getAttributeValue("last"); String citySt = citation.getAttributeValue("city"); String volumeSt = citation.getAttributeValue("volume"); if (nameSt == null) { nameSt = ""; } if (dateSt == null) { dateSt = ""; } if (titleSt == null) { titleSt = ""; } if (publisherSt == null) { publisherSt = ""; } if (firstSt == null) { firstSt = ""; } if (lastSt == null) { lastSt = ""; } if (citySt == null) { citySt = ""; } if (volumeSt == null) { volumeSt = ""; } long bookId = -1; IndexHits<Long> bookNameIndexHits = bookNameIndex.get(BookNode.BOOK_NAME_FULL_TEXT_INDEX, nameSt); if (bookNameIndexHits.hasNext()) { bookId = bookNameIndexHits.getSingle(); } bookNameIndexHits.close(); if (bookId < 0) { bookProperties.put(BookNode.NAME_PROPERTY, nameSt); bookProperties.put(BookNode.DATE_PROPERTY, dateSt); //---book node creation and indexing bookId = 
inserter.createNode(bookProperties); bookNameIndex.add(bookId, MapUtil.map(BookNode.BOOK_NAME_FULL_TEXT_INDEX, nameSt)); //--indexing node by type--- nodeTypeIndex.add(bookId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, BookNode.NODE_TYPE)); //--flushing book name index--- bookNameIndex.flush(); //---authors association----- for (long personId : authorsPersonNodesIds) { inserter.createRelationship(bookId, personId, bookAuthorRel, null); } //---editor association----- Element editorListElem = citation.getChild("editorList"); if (editorListElem != null) { List<Element> editorsElems = editorListElem.getChildren("person"); for (Element person : editorsElems) { //long editorId = indexService.getSingleNode(PersonNode.PERSON_NAME_INDEX, person.getAttributeValue("name")); long editorId = -1; IndexHits<Long> personNameIndexHits = personNameIndex.get( PersonNode.PERSON_NAME_FULL_TEXT_INDEX, person.getAttributeValue("name")); if (personNameIndexHits.hasNext()) { editorId = personNameIndexHits.getSingle(); } personNameIndexHits.close(); if (editorId < 0) { personProperties.put(PersonNode.NAME_PROPERTY, person.getAttributeValue("name")); editorId = createPersonNode(personProperties, inserter, personNameIndex, nodeTypeIndex); } //---flushing person name index--- personNameIndex.flush(); //editor association inserter.createRelationship(bookId, editorId, bookEditorRel, null); } } //----publisher-- if (!publisherSt.equals("")) { //long publisherId = indexService.getSingleNode(PublisherNode.PUBLISHER_NAME_INDEX, publisherSt); long publisherId = -1; IndexHits<Long> publisherNameIndexHits = publisherNameIndex .get(PublisherNode.PUBLISHER_NAME_INDEX, publisherSt); if (publisherNameIndexHits.hasNext()) { publisherId = publisherNameIndexHits.getSingle(); } publisherNameIndexHits.close(); if (publisherId < 0) { publisherProperties.put(PublisherNode.NAME_PROPERTY, publisherSt); publisherId = inserter.createNode(publisherProperties); //--indexing node by type--- 
nodeTypeIndex.add(publisherId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, PublisherNode.NODE_TYPE)); publisherNameIndex.add(publisherId, MapUtil.map(PublisherNode.PUBLISHER_NAME_INDEX, publisherSt)); //--flushing publisher name index-- publisherNameIndex.flush(); } inserter.createRelationship(bookId, publisherId, bookPublisherRel, null); } //-----city----- if (!citySt.equals("")) { //long cityId = indexService.getSingleNode(CityNode.CITY_NAME_INDEX, citySt); long cityId = -1; IndexHits<Long> cityNameIndexHits = cityNameIndex.get(CityNode.CITY_NAME_INDEX, citySt); if (cityNameIndexHits.hasNext()) { cityId = cityNameIndexHits.getSingle(); } cityNameIndexHits.close(); if (cityId < 0) { cityProperties.put(CityNode.NAME_PROPERTY, citySt); cityId = createCityNode(cityProperties, inserter, cityNameIndex, nodeTypeIndex); //-----flushing city name index--- cityNameIndex.flush(); } inserter.createRelationship(bookId, cityId, bookCityRel, null); } } bookProteinCitationProperties.put(BookProteinCitationRel.FIRST_PROPERTY, firstSt); bookProteinCitationProperties.put(BookProteinCitationRel.LAST_PROPERTY, lastSt); bookProteinCitationProperties.put(BookProteinCitationRel.VOLUME_PROPERTY, volumeSt); bookProteinCitationProperties.put(BookProteinCitationRel.TITLE_PROPERTY, titleSt); //--protein citation relationship inserter.createRelationship(bookId, currentProteinId, bookProteinCitationRel, bookProteinCitationProperties); } //---------------------------------------------------------------------------- //-----------------------------ONLINE ARTICLE----------------------------------------- break; case OnlineArticleNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getOnlineArticles()) { String locatorSt = citation.getChildText("locator"); String nameSt = citation.getAttributeValue("name"); String titleSt = citation.getChildText("title"); if (titleSt == null) { titleSt = ""; } if (nameSt == null) { nameSt = ""; } if (locatorSt == null) { locatorSt = ""; } long onlineArticleId = 
-1; IndexHits<Long> onlineArticleTitleIndexHits = onlineArticleTitleIndex .get(OnlineArticleNode.ONLINE_ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt); if (onlineArticleTitleIndexHits.hasNext()) { onlineArticleId = onlineArticleTitleIndexHits.getSingle(); } onlineArticleTitleIndexHits.close(); if (onlineArticleId < 0) { onlineArticleProperties.put(OnlineArticleNode.TITLE_PROPERTY, titleSt); onlineArticleId = inserter.createNode(onlineArticleProperties); //--indexing node by type--- nodeTypeIndex.add(onlineArticleId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, OnlineArticleNode.NODE_TYPE)); if (!titleSt.equals("")) { onlineArticleTitleIndex.add(onlineArticleId, MapUtil .map(OnlineArticleNode.ONLINE_ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt)); //-----flushing online article title index--- onlineArticleTitleIndex.flush(); } //---authors person association----- for (long personId : authorsPersonNodesIds) { inserter.createRelationship(onlineArticleId, personId, onlineArticleAuthorRel, null); } //---authors consortium association----- for (long consortiumId : authorsConsortiumNodesIds) { inserter.createRelationship(onlineArticleId, consortiumId, onlineArticleAuthorRel, null); } //------online journal----------- if (!nameSt.equals("")) { long onlineJournalId = -1; IndexHits<Long> onlineJournalNameIndexHits = onlineJournalNameIndex .get(OnlineJournalNode.ONLINE_JOURNAL_NAME_INDEX, nameSt); if (onlineJournalNameIndexHits.hasNext()) { onlineJournalId = onlineJournalNameIndexHits.getSingle(); } onlineJournalNameIndexHits.close(); if (onlineJournalId < 0) { onlineJournalProperties.put(OnlineJournalNode.NAME_PROPERTY, nameSt); onlineJournalId = inserter.createNode(onlineJournalProperties); //--indexing node by type--- nodeTypeIndex.add(onlineJournalId, MapUtil .map(Bio4jManager.NODE_TYPE_INDEX_NAME, OnlineJournalNode.NODE_TYPE)); onlineJournalNameIndex.add(onlineJournalId, MapUtil.map(OnlineJournalNode.ONLINE_JOURNAL_NAME_INDEX, nameSt)); //---flushing online journal name index--- 
onlineJournalNameIndex.flush(); } onlineArticleJournalProperties.put(OnlineArticleJournalRel.LOCATOR_PROPERTY, locatorSt); inserter.createRelationship(onlineArticleId, onlineJournalId, onlineArticleJournalRel, onlineArticleJournalProperties); } //---------------------------- } //protein citation inserter.createRelationship(onlineArticleId, currentProteinId, onlineArticleProteinCitationRel, null); } //---------------------------------------------------------------------------- //-----------------------------ARTICLE----------------------------------------- break; case ArticleNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getArticles()) { String journalNameSt = citation.getAttributeValue("name"); String dateSt = citation.getAttributeValue("date"); String titleSt = citation.getChildText("title"); String firstSt = citation.getAttributeValue("first"); String lastSt = citation.getAttributeValue("last"); String volumeSt = citation.getAttributeValue("volume"); String doiSt = ""; String medlineSt = ""; String pubmedSt = ""; if (journalNameSt == null) { journalNameSt = ""; } if (dateSt == null) { dateSt = ""; } if (firstSt == null) { firstSt = ""; } if (lastSt == null) { lastSt = ""; } if (volumeSt == null) { volumeSt = ""; } if (titleSt == null) { titleSt = ""; } List<Element> dbReferences = citation.getChildren("dbReference"); for (Element tempDbRef : dbReferences) { switch (tempDbRef.getAttributeValue("type")) { case "DOI": doiSt = tempDbRef.getAttributeValue("id"); break; case "MEDLINE": medlineSt = tempDbRef.getAttributeValue("id"); break; case "PubMed": pubmedSt = tempDbRef.getAttributeValue("id"); break; } } //long articleId = indexService.getSingleNode(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt); long articleId = -1; IndexHits<Long> articleTitleIndexHits = articleTitleIndex .get(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt); if (articleTitleIndexHits.hasNext()) { articleId = articleTitleIndexHits.getSingle(); } articleTitleIndexHits.close(); if 
(articleId < 0) { articleProperties.put(ArticleNode.TITLE_PROPERTY, titleSt); articleProperties.put(ArticleNode.DOI_ID_PROPERTY, doiSt); articleProperties.put(ArticleNode.MEDLINE_ID_PROPERTY, medlineSt); articleProperties.put(ArticleNode.PUBMED_ID_PROPERTY, pubmedSt); articleId = inserter.createNode(articleProperties); //--indexing node by type--- nodeTypeIndex.add(articleId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, ArticleNode.NODE_TYPE)); if (!titleSt.equals("")) { articleTitleIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_TITLE_FULL_TEXT_INDEX, titleSt)); //--flushing article title index--- articleTitleIndex.flush(); } //---indexing by medline, doi and pubmed-- if (!doiSt.isEmpty()) { articleDoiIdIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_DOI_ID_INDEX, doiSt)); } if (!medlineSt.isEmpty()) { articleMedlineIdIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_MEDLINE_ID_INDEX, medlineSt)); } if (!pubmedSt.isEmpty()) { articlePubmedIdIndex.add(articleId, MapUtil.map(ArticleNode.ARTICLE_PUBMED_ID_INDEX, pubmedSt)); } //---authors person association----- for (long personId : authorsPersonNodesIds) { inserter.createRelationship(articleId, personId, articleAuthorRel, null); } //---authors consortium association----- for (long consortiumId : authorsConsortiumNodesIds) { inserter.createRelationship(articleId, consortiumId, articleAuthorRel, null); } //------journal----------- if (!journalNameSt.equals("")) { //long journalId = indexService.getSingleNode(JournalNode.JOURNAL_NAME_INDEX, journalNameSt); long journalId = -1; IndexHits<Long> journalNameIndexHits = journalNameIndex .get(JournalNode.JOURNAL_NAME_INDEX, journalNameSt); if (journalNameIndexHits.hasNext()) { journalId = journalNameIndexHits.getSingle(); } journalNameIndexHits.close(); if (journalId < 0) { journalProperties.put(JournalNode.NAME_PROPERTY, journalNameSt); journalId = inserter.createNode(journalProperties); //--indexing node by type--- nodeTypeIndex.add(journalId, 
MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, JournalNode.NODE_TYPE)); journalNameIndex.add(journalId, MapUtil.map(JournalNode.JOURNAL_NAME_INDEX, journalNameSt)); //----flushing journal name index---- journalNameIndex.flush(); } articleJournalProperties.put(ArticleJournalRel.DATE_PROPERTY, dateSt); articleJournalProperties.put(ArticleJournalRel.FIRST_PROPERTY, firstSt); articleJournalProperties.put(ArticleJournalRel.LAST_PROPERTY, lastSt); articleJournalProperties.put(ArticleJournalRel.VOLUME_PROPERTY, volumeSt); inserter.createRelationship(articleId, journalId, articleJournalRel, articleJournalProperties); } //---------------------------- } //protein citation inserter.createRelationship(articleId, currentProteinId, articleProteinCitationRel, null); } //---------------------------------------------------------------------------- //----------------------UNPUBLISHED OBSERVATIONS----------------------------------------- break; case UnpublishedObservationNode.UNIPROT_ATTRIBUTE_TYPE_VALUE: if (uniprotDataXML.getUnpublishedObservations()) { String dateSt = citation.getAttributeValue("date"); if (dateSt == null) { dateSt = ""; } unpublishedObservationProperties.put(UnpublishedObservationNode.DATE_PROPERTY, dateSt); long unpublishedObservationId = inserter.createNode(unpublishedObservationProperties); //--indexing node by type--- nodeTypeIndex.add(unpublishedObservationId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, UnpublishedObservationNode.NODE_TYPE)); //---authors person association----- for (long personId : authorsPersonNodesIds) { inserter.createRelationship(unpublishedObservationId, personId, unpublishedObservationAuthorRel, null); } inserter.createRelationship(unpublishedObservationId, currentProteinId, unpublishedObservationProteinCitationRel, null); } break; } } } }
From source file:com.bio4j.neo4jdb.programs.ImportUniref.java
License:Open Source License
private static String getRepresentantAccession(Element elem) { String result = null;//from ww w .ja va2 s . c om Element dbReference = elem.getChild("dbReference"); List<Element> properties = dbReference.getChildren("property"); for (Element prop : properties) { if (prop.getAttributeValue("type").equals("UniProtKB accession")) { result = prop.getAttributeValue("value"); } } return result; }
From source file:com.bio4j.neo4jdb.programs.ImportUniref.java
License:Open Source License
private static int importUnirefFile(BatchInserter inserter, BatchInserterIndex proteinAccessionIndex, BatchInserterIndex isoformIdIndex, File unirefFile, BasicRelationship relationship) throws Exception { StringBuilder entryStBuilder = new StringBuilder(); BufferedReader reader = new BufferedReader(new FileReader(unirefFile)); String line;/*from w w w . ja va2 s . c o m*/ int entryCounter = 0; int limitForPrintingOut = 10000; while ((line = reader.readLine()) != null) { //----we reached a entry line----- if (line.trim().startsWith("<" + UniprotStuff.ENTRY_TAG_NAME)) { while (!line.trim().startsWith("</" + UniprotStuff.ENTRY_TAG_NAME + ">")) { entryStBuilder.append(line); line = reader.readLine(); } //organism last line entryStBuilder.append(line); XMLElement entryXMLElem = new XMLElement(entryStBuilder.toString()); entryStBuilder.delete(0, entryStBuilder.length()); ArrayList<String> membersAccessionList = new ArrayList<String>(); Element representativeMember = entryXMLElem.asJDomElement().getChild("representativeMember"); String representantAccession = getRepresentantAccession(representativeMember); List<Element> members = entryXMLElem.asJDomElement().getChildren("member"); for (Element member : members) { Element memberDbReference = member.getChild("dbReference"); List<Element> memberProperties = memberDbReference.getChildren("property"); for (Element prop : memberProperties) { if (prop.getAttributeValue("type").equals("UniProtKB accession")) { String memberAccession = prop.getAttributeValue("value"); membersAccessionList.add(memberAccession); } } } if (representantAccession != null) { long representantId = -1; //---The representant is an isoform---- if (representantAccession.contains("-")) { IndexHits<Long> repIndexHits = isoformIdIndex.get(IsoformNode.ISOFORM_ID_INDEX, representantAccession); if (repIndexHits.size() == 1) { representantId = repIndexHits.getSingle(); } repIndexHits.close(); } //---The representant is a protein else { IndexHits<Long> hits = 
proteinAccessionIndex.get(ProteinNode.PROTEIN_ACCESSION_INDEX, representantAccession); if (hits.size() == 1) { //System.out.println("representantAccession = " + representantAccession); representantId = hits.getSingle(); } hits.close(); } //----we only create the relationships in the case where we found // a valid representant id----- if (representantId >= 0) { for (String memberAccession : membersAccessionList) { long memberId = -1; if (memberAccession.contains("-")) { IndexHits<Long> isoHits = isoformIdIndex.get(IsoformNode.ISOFORM_ID_INDEX, memberAccession); if (isoHits.size() == 1) { memberId = isoHits.getSingle(); } isoHits.close(); } else { IndexHits<Long> protHits = proteinAccessionIndex .get(ProteinNode.PROTEIN_ACCESSION_INDEX, memberAccession); if (protHits.size() == 1) { memberId = protHits.getSingle(); } protHits.close(); } if (memberId >= 0) { inserter.createRelationship(representantId, memberId, relationship, null); } } } } else { logger.log(Level.SEVERE, ("null representant accession for entry: " + entryXMLElem.asJDomElement().getAttributeValue("id"))); } } entryCounter++; if ((entryCounter % limitForPrintingOut) == 0) { logger.log(Level.INFO, (entryCounter + " entries parsed!!")); } } reader.close(); return entryCounter; }
From source file:com.cats.version.VersionCfgParseAndSave.java
License:Apache License
public List<VersionInfo> getVersionInfo(String fullPath) { SAXBuilder builder = new SAXBuilder(); List<VersionInfo> infos = new ArrayList<VersionInfo>(); try {//w w w . j a va 2s . c om Document doc = builder.build(new File(fullPath)); Element root = doc.getRootElement(); List<Element> softEles = root.getChildren("software"); for (Element softEle : softEles) { String appName = softEle.getAttribute("name").getValue(); String versionCode = softEle.getChildText("latest-version-code"); String versionName = softEle.getChildText("latest-version"); String versionPath = softEle.getChildText("latest-version-abspath"); String startupName = softEle.getChildText("latest-version-startup"); Element detailEles = softEle.getChild("latest-version-detail"); List<Element> detailItemEles = detailEles.getChildren("item"); List<VersionInfoDetail> details = new ArrayList<VersionInfoDetail>(); for (Element detailItem : detailItemEles) { String title = detailItem.getAttributeValue("name"); List<Element> detailEleList = detailItem.getChildren("detail"); List<String> detailList = new ArrayList<String>(); for (Element detailEle : detailEleList) { String strDetail = detailEle.getText(); detailList.add(strDetail); } details.add(new VersionInfoDetail(title, detailList)); } Element ignoreEles = softEle.getChild("ignore-files"); List<String> ignoreFiles = new ArrayList<String>(); if (ignoreEles != null) { List<Element> ignoreItems = ignoreEles.getChildren("item"); for (Element ignoreItem : ignoreItems) { ignoreFiles.add(ignoreItem.getText()); } } VersionInfo versionInfo = new VersionInfo(); versionInfo.setAppName(appName); versionInfo.setVersion(versionName); versionInfo.setStartupName(startupName); versionInfo.setVersionCode(Integer.parseInt(versionCode)); versionInfo.setPath(versionPath); versionInfo.setDetails(details); versionInfo.setIgnoreFiles(ignoreFiles); infos.add(versionInfo); } } catch (Exception e) { e.printStackTrace(); return null; } return infos; }
From source file:com.cedarsoft.serialization.test.performance.XmlParserPerformance.java
License:Open Source License
public void benchJdom() { runBenchmark(new Runnable() { @Override//from www. j a va 2 s. c om public void run() { try { for (int i = 0; i < SMALL; i++) { Document doc = new SAXBuilder().build(new StringReader(CONTENT_SAMPLE)); Element fileTypeElement = doc.getRootElement(); Element extensionElement = fileTypeElement.getChild("extension"); Extension extension = new Extension(extensionElement.getAttributeValue("delimiter"), extensionElement.getText(), extensionElement.getAttribute("default").getBooleanValue()); FileType fileType = new FileType(fileTypeElement.getChildText("id"), extension, fileTypeElement.getAttribute("dependent").getBooleanValue()); assertNotNull(fileType); } } catch (Exception e) { throw new RuntimeException(e); } } }, 4); }