Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.loader.entrez.pubmed; import java.io.IOException; import java.io.InputStream; import java.text.DateFormat; import java.text.ParseException; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.time.DateUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import ubic.gemma.model.common.description.BibliographicReference; import ubic.gemma.model.common.description.DatabaseEntry; import ubic.gemma.model.common.description.ExternalDatabase; import ubic.gemma.model.common.description.Keyword; import ubic.gemma.model.common.description.MedicalSubjectHeading; import ubic.gemma.model.common.description.PublicationType; import ubic.gemma.model.expression.biomaterial.Compound; /** * Simple class to parse XML in the format defined by {@link http * ://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_041101.dtd}. The resulting BibliographicReference object is * associated with (transient) DatabaseEntry, in turn to a (transient) ExternalDatabase and MeSH. * * @author pavlidis * @version $Id: PubMedXMLParser.java,v 1.24 2013/05/29 04:11:29 paul Exp $ */ public class PubMedXMLParser { private static final String ERROR_TAG = "Error"; private static final String PUB_MED_EXTERNAL_DB_NAME = "PubMed"; protected static final Log log = LogFactory.getLog(PubMedXMLParser.class); DocumentBuilder builder; DateFormat df = DateFormat.getDateInstance(DateFormat.MEDIUM); private final String[] formats = new String[] { "MMM dd, yyyy", "yyyy" }; /** * @param bibRef * @param item * @throws IOException */ public void extractBookPublicationYear(BibliographicReference bibRef, Node item) throws IOException { NodeList c = item.getChildNodes(); for (int i = 0; i < c.getLength(); i++) { Node a = c.item(i); if (!(a instanceof Element)) { continue; } if (a.getNodeName().equals("Year")) { try { bibRef.setPublicationDate(DateUtils.parseDate(XMLUtils.getTextValue((Element) a), formats)); } catch (ParseException e) { log.warn("Could not extract date of publication from : " + XMLUtils.getTextValue((Element) a)); } } } } /** * @param bibRef * @param item * @throws IOException */ public void extractPublisher(BibliographicReference bibRef, Node item) throws IOException { NodeList c = item.getChildNodes(); for (int i = 0; i < c.getLength(); i++) { Node a = c.item(i); if (!(a instanceof Element)) { continue; } if (a.getNodeName().equals("PublisherName")) { bibRef.setPublisher(XMLUtils.getTextValue((Element) a)); } else if (a.getNodeName().equals("PublisherLocation")) { bibRef.setPublisher(bibRef.getPublisher() + " [ " + XMLUtils.getTextValue((Element) a) + "]"); } } } /** * @param is * @return */ public Collection<BibliographicReference> parse(InputStream is) { try { if (is.available() == 0) { throw new IOException("XML stream contains no data."); } DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setIgnoringComments(true); factory.setValidating(false); builder = factory.newDocumentBuilder(); Document document = builder.parse(is); log.debug("done parsing"); return extractBibRefs(document); } catch (IOException e) { throw new RuntimeException(e); } catch (ParserConfigurationException e) { throw new RuntimeException(e); } catch (SAXException e) { throw new RuntimeException(e); } } /** * @param doc * @return * @throws IOException */ private String extractAuthorList(NodeList authorList) throws IOException { StringBuilder al = new StringBuilder(); for (int i = 0; i < authorList.getLength(); i++) { Node item = authorList.item(i); if (!(item instanceof Element)) { continue; } NodeList nl = item.getChildNodes(); for (int j = 0; j < nl.getLength(); j++) { Node m = nl.item(j); if (m instanceof Element) { Element f = (Element) m; String nodeName = f.getNodeName(); if (nodeName.equals("LastName")) { al.append(XMLUtils.getTextValue(f)); al.append(", "); } else if (nodeName.equals("ForeName")) { al.append(XMLUtils.getTextValue(f)); al.append("; "); } else if (nodeName.equals("Initials")) { // noop ; } else if (nodeName.equals("CollectiveName")) { al.append(XMLUtils.getTextValue(f)); al.append("; "); } } } } if (al.length() == 0) return "(No authors listed)"; if (al.length() < 3) return al.toString(); return al.toString().substring(0, al.length() - 2); // trim trailing semicolon + space. } /** * @param doc * @return * @throws IOException */ private Collection<BibliographicReference> extractBibRefs(Document document) throws IOException { // Was there an error? (not found) if (document.getElementsByTagName(ERROR_TAG).getLength() > 0) { return null; } NodeList articles = document.getElementsByTagName("MedlineCitation"); if (articles.getLength() == 0) { // mebbe it is a book? articles = document.getElementsByTagName("BookDocument"); if (articles.getLength() > 0) { return parseBookArticles(articles); } return new HashSet<BibliographicReference>(); } Collection<BibliographicReference> result = new HashSet<BibliographicReference>(); log.debug(articles.getLength() + " articles found in document"); int i = 0; for (; i < articles.getLength(); i++) { BibliographicReference bibRef = BibliographicReference.Factory.newInstance(); Node record = articles.item(i); Node article = processRecord(bibRef, record); assert article != null; Node journal = processArticle(bibRef, article); processJournalInfo(bibRef, journal); result.add(bibRef); if (i > 0 && i % 1000 == 0) { log.info("Processed " + i + " articles"); } } log.info("Processed " + i + " articles"); return result; } /** * @param chemNodes * @return * @throws TransformerException * @throws IOException */ private Collection<Compound> extractChemicals(Node chemNodes) throws IOException { Collection<Compound> compounds = new HashSet<Compound>(); NodeList childNodes = chemNodes.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node chemNode = childNodes.item(i); NodeList termNodes = chemNode.getChildNodes(); if (termNodes.getLength() == 0) continue; Compound c = Compound.Factory.newInstance(); for (int j = 0; j < termNodes.getLength(); j++) { Node item = termNodes.item(j); if (!(item instanceof Element)) { continue; } Element el = (Element) item; if (el.getNodeName().equals("RegistryNumber")) { String regString = XMLUtils.getTextValue(el); c.setRegistryNumber(regString); } else { String txt = XMLUtils.getTextValue(el); c.setName(txt); } } log.debug(c.getName()); compounds.add(c); } return compounds; } /** * @param dateNode * @return * @throws TransformerException * @throws IOException */ private Date extractJournalIssueDate(Node dateNode) throws IOException { String yearText = null;// = XMLUtils.getTextValue( ( Element ) y ); String medLineText = null;// = XMLUtils.getTextValue( ( Element ) medLineDate ); String monthText = null;// = XMLUtils.getTextValue( ( Element ) m ); String dayText = null;// = XMLUtils.getTextValue( ( Element ) dn ); NodeList childNodes = dateNode.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node c = childNodes.item(i); if (!(c instanceof Element)) { continue; } String t = XMLUtils.getTextValue((Element) c); if (c.getNodeName().equals("Year")) { yearText = t; } else if (c.getNodeName().equals("Month")) { monthText = t; } else if (c.getNodeName().equals("Day")) { dayText = t; } else if (c.getNodeName().equals("MedlineDate")) { medLineText = t; } } df.setLenient(true); if (yearText == null && medLineText != null) { String[] yearmo = medLineText.split("\\s"); if (yearmo.length == 2) { // 1983 Aug yearText = yearmo[0]; monthText = yearmo[1]; monthText = monthText.replaceAll("-\\w+", ""); } else if (yearmo.length == 4) { // 1983 Aug 31-Sep 6 yearText = yearmo[0]; monthText = yearmo[1]; dayText = yearmo[2].replaceAll("-\\w+", ""); } else if (yearmo.length == 3) { // 1983 Jul 9-16 yearText = yearmo[0]; monthText = yearmo[1]; dayText = yearmo[2].replaceAll("-\\w+", ""); } else if (yearmo.length == 1) { // 1983-84 yearText = yearmo[0]; yearText = yearText.replaceAll("-\\w+", ""); } else { log.warn("No data information from medline text: " + medLineText); } } if (monthText == null) { monthText = "Jan"; // arbitrary... } String dateString = monthText + " " + (dayText == null ? "1" : dayText) + ", " + yearText; try { return DateUtils.parseDate(dateString, formats); } catch (ParseException e) { log.warn("Could not parse date " + dateString + " from medlinetext=" + medLineText); return null; } } /** * @param keywordNode * @return * @throws TransformerException * @throws IOException */ private Collection<Keyword> extractKeywords(Node keywordNode) throws IOException { Collection<Keyword> keywords = new HashSet<Keyword>(); NodeList childNodes = keywordNode.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node item = childNodes.item(i); if (!(item instanceof Element)) { continue; } Element el = (Element) item; String keyword = XMLUtils.getTextValue(el); Boolean isMajor = isMajorHeading(item); Keyword kw = Keyword.Factory.newInstance(); kw.setTerm(keyword); kw.setIsMajorTopic(isMajor); keywords.add(kw); } return keywords; } /** * Get the date this was put in pubmed. * * @param dateNode * @return * @throws IOException */ private Date extractPublicationDate(Node dateNode) throws IOException { Date d = extractJournalIssueDate(dateNode); // if ( d == null ) d = extractPubmedPubdate( dateNode ); return d; } /** * @param article * @return * @throws TransformerException * @throws IOException */ private Collection<PublicationType> extractPublicationTypes(Node pubtypeList) throws IOException { Collection<PublicationType> publicationTypes = new HashSet<PublicationType>(); NodeList childNodes = pubtypeList.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node item = childNodes.item(i); if (!(item instanceof Element)) { continue; } String type = XMLUtils.getTextValue((Element) item); PublicationType pt = PublicationType.Factory.newInstance(); pt.setType(type); publicationTypes.add(pt); } return publicationTypes; } private boolean isMajorHeading(Node descriptor) { Attr dmajorTopic = (Attr) descriptor.getAttributes().getNamedItem("MajorTopicYN"); return dmajorTopic.getValue().equals("Y"); } /** * @param articles * @return */ private Collection<BibliographicReference> parseBookArticles(NodeList articles) throws IOException { Collection<BibliographicReference> result = new HashSet<BibliographicReference>(); int i = 0; for (; i < articles.getLength(); i++) { BibliographicReference bibRef = BibliographicReference.Factory.newInstance(); Node record = articles.item(i); processBookRecord(bibRef, record); result.add(bibRef); if (i > 0 && i % 1000 == 0) { log.info("Processed " + i + " books"); } } log.info("Processed " + i + " books"); return result; } private void processAccession(BibliographicReference bibRef, Node record) throws IOException { String accession = XMLUtils.getTextValue((Element) record); DatabaseEntry dbEntry = DatabaseEntry.Factory.newInstance(); dbEntry.setAccession(accession); ExternalDatabase exDb = ExternalDatabase.Factory.newInstance(); exDb.setName(PUB_MED_EXTERNAL_DB_NAME); dbEntry.setExternalDatabase(exDb); bibRef.setPubAccession(dbEntry); } private Node processArticle(BibliographicReference bibRef, Node article) throws IOException { NodeList childNodes = article.getChildNodes(); Node journal = null; for (int j = 0; j < childNodes.getLength(); j++) { Node item = childNodes.item(j); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); if (name.equals("ArticleTitle")) { bibRef.setTitle(XMLUtils.getTextValue((Element) item)); } else if (name.equals("Journal")) { journal = item; } else if (name.equals("AuthorList")) { bibRef.setAuthorList(extractAuthorList(item.getChildNodes())); } else if (name.equals("Pagination")) { bibRef.setPages(XMLUtils.extractOneChild(item, "MedlinePgn")); } else if (name.equals("Abstract")) { // abstracts can have parts List<String> abstractParts = XMLUtils.extractMultipleChildren(item, "AbstractText"); if (abstractParts.size() > 1) { StringBuilder buf = new StringBuilder(); NodeList jNodes = item.getChildNodes(); for (int q = 0; q < jNodes.getLength(); q++) { Node jitem = jNodes.item(q); if (!(jitem instanceof Element)) { continue; } if (jitem.getNodeName().equals("AbstractText")) { String label = jitem.getAttributes().getNamedItem("Label").getTextContent(); String part = jitem.getTextContent(); if (StringUtils.isNotBlank(label)) { buf.append(label + ": " + part + "\n"); } else { buf.append(part + "\n"); } } } bibRef.setAbstractText(buf.toString()); } else { bibRef.setAbstractText(abstractParts.iterator().next()); } } else if (name.equals("PublicationTypeList")) { bibRef.setPublicationTypes(extractPublicationTypes(item)); } } return journal; } private void processBookInfo(BibliographicReference bibRef, Node article) throws IOException { NodeList childNodes = article.getChildNodes(); for (int j = 0; j < childNodes.getLength(); j++) { Node item = childNodes.item(j); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); if (name.equals("Publisher")) { extractPublisher(bibRef, item); } else if (name.equals("PubDate") && bibRef.getPublicationDate() == null) { extractBookPublicationYear(bibRef, item); } else if (name.equals("AuthorList")) { if (((Element) item).hasAttribute("Type")) { if (((Element) item).getAttribute("Type").equals("editors")) { bibRef.setEditor(extractAuthorList(item.getChildNodes())); } else { bibRef.setAuthorList(extractAuthorList(item.getChildNodes())); } } } else if (name.equals("BookTitle")) { if (bibRef.getTitle() == null) bibRef.setTitle(XMLUtils.getTextValue((Element) item)); bibRef.setPublication(XMLUtils.getTextValue((Element) item)); } } } /** * Fill in information about the book: Publisher, Editor(s), Publication year * * @param bibRef * @param record * @return * @throws IOException */ private void processBookRecord(BibliographicReference bibRef, Node record) throws IOException { NodeList recordNodes = record.getChildNodes(); for (int p = 0; p < recordNodes.getLength(); p++) { Node item = recordNodes.item(p); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); if (name.equals("ArticleTitle")) { // this is the title of the chapter. bibRef.setTitle(StringUtils.strip(XMLUtils.getTextValue((Element) item))); } else if (name.equals("Book")) { processBookInfo(bibRef, item); } else if (name.equals("AuthorList")) { bibRef.setAuthorList(extractAuthorList(item.getChildNodes())); } else if (name.equals("Abstract")) { bibRef.setAbstractText(""); NodeList abstractTextSections = item.getChildNodes(); for (int q = 0; q < abstractTextSections.getLength(); q++) { Node jitem = abstractTextSections.item(q); if (!(jitem instanceof Element)) { continue; } if (jitem.getNodeName().equals("AbstractText")) { bibRef.setAbstractText( bibRef.getAbstractText() + (XMLUtils.getTextValue((Element) jitem)) + " "); } bibRef.setAbstractText(bibRef.getAbstractText().trim()); } } else if (name.equals("PMID")) { processAccession(bibRef, item); } else if (name.equals("ContributionDate")) { /* * Unusual, but happens for books that are updated with new sections. We use this instead of the * publication date. */ extractBookPublicationYear(bibRef, item); } } } /** * @param bibRef * @param journal * @return * @throws IOException */ private NodeList processJournalInfo(BibliographicReference bibRef, Node journal) throws IOException { NodeList journalNodes = journal.getChildNodes(); for (int j = 0; j < journalNodes.getLength(); j++) { Node item = journalNodes.item(j); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); if (name.equals("JournalIssue")) { NodeList journalIssueNodes = item.getChildNodes(); for (int k = 0; k < journalIssueNodes.getLength(); k++) { Node jitem = journalIssueNodes.item(k); if (!(jitem instanceof Element)) { continue; } String jname = jitem.getNodeName(); if (jname.equals("Volume")) { bibRef.setVolume(XMLUtils.getTextValue((Element) jitem)); } else if (jname.equals("Issue")) { bibRef.setIssue(XMLUtils.getTextValue((Element) jitem)); } else if (jname.equals("PubDate")) { bibRef.setPublicationDate(extractPublicationDate(jitem)); } } } } return journalNodes; } /** * @param meshHeadings * @param bibRef * @throws TransformerException * @throws IOException */ private void processMESH(Node meshHeadings, BibliographicReference bibRef) throws IOException { NodeList childNodes = meshHeadings.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node meshNode = childNodes.item(i); NodeList termNodes = meshNode.getChildNodes(); MedicalSubjectHeading vc = MedicalSubjectHeading.Factory.newInstance(); if (termNodes.getLength() == 0) continue; // these might just be a single Descriptor or a Descriptor with Qualifiers. for (int j = 0; j < termNodes.getLength(); j++) { Node item = termNodes.item(j); if (!(item instanceof Element)) { continue; } Element descriptor = (Element) item; if (descriptor.getNodeName().equals("DescriptorName")) { String d = XMLUtils.getTextValue(descriptor); boolean dmajorB = isMajorHeading(descriptor); vc.setTerm(d); vc.setIsMajorTopic(dmajorB); } else { MedicalSubjectHeading qual = MedicalSubjectHeading.Factory.newInstance(); String q = XMLUtils.getTextValue(descriptor); boolean qmajorB = isMajorHeading(descriptor); qual.setIsMajorTopic(qmajorB); qual.setTerm(q); vc.getQualifiers().add(qual); } } bibRef.getMeshTerms().add(vc); } } private Node processRecord(BibliographicReference bibRef, Node record) throws IOException { Node article = null; NodeList recordNodes = record.getChildNodes(); for (int p = 0; p < recordNodes.getLength(); p++) { Node item = recordNodes.item(p); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); if (name.equals("Article")) { article = item; } else if (name.equals("ChemicalList")) { bibRef.setChemicals(extractChemicals(item)); } else if (name.equals("MeshHeadingList")) { processMESH(item, bibRef); } else if (name.equals("KeywordList")) { bibRef.setKeywords(extractKeywords(item)); } else if (name.equals("MedlineJournalInfo")) { NodeList jNodes = item.getChildNodes(); for (int q = 0; q < jNodes.getLength(); q++) { Node jitem = jNodes.item(q); if (!(jitem instanceof Element)) { continue; } if (jitem.getNodeName().equals("MedlineTA")) { bibRef.setPublication(XMLUtils.getTextValue((Element) jitem)); } } } else if (name.equals("PMID")) { processAccession(bibRef, item); } else if (name.equals("CommentsCorrectionsList")) { NodeList jNodes = item.getChildNodes(); for (int q = 0; q < jNodes.getLength(); q++) { Node jitem = jNodes.item(q); if (!(jitem instanceof Element)) { continue; } Node reftype = jitem.getAttributes().getNamedItem("RefType"); if (reftype == null) continue; String reftypeName = ((Attr) reftype).getValue(); log.debug(reftypeName); if (reftypeName.equals("RetractionIn")) { try { XPathFactory xf = XPathFactory.newInstance(); XPath xpath = xf.newXPath(); XPathExpression xgds = xpath.compile("RefSource/text()"); String ref = (String) xgds.evaluate(jitem, XPathConstants.STRING); xgds = xpath.compile("PMID/text()"); String pmid = (String) xgds.evaluate(jitem, XPathConstants.STRING); String description = "Retracted [In: " + ref + " PMID=" + pmid + "]"; bibRef.setDescription(description); } catch (XPathExpressionException e) { log.warn("Error while trying to get details of the retraction: " + e.getMessage(), e); continue; } /* * Such papers also have <PublicationType>Retracted Publication</PublicationType> */ } } } } return article; } }