ubic.gemma.loader.entrez.pubmed.PubMedXMLParser.java Source code

Introduction

Here is the source code for ubic.gemma.loader.entrez.pubmed.PubMedXMLParser.java
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.entrez.pubmed;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.text.DateFormat;
import java.text.ParseException;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import ubic.gemma.model.common.description.BibliographicReference;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.common.description.Keyword;
import ubic.gemma.model.common.description.MedicalSubjectHeading;
import ubic.gemma.model.common.description.PublicationType;
import ubic.gemma.model.expression.biomaterial.Compound;

/**
 * Simple class to parse XML in the format defined by the
 * <a href="http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_041101.dtd">PubMed DTD</a>. Each resulting
 * BibliographicReference is associated with a (transient) DatabaseEntry, which in turn refers to a (transient)
 * ExternalDatabase; MeSH terms are attached to the reference as well.
 * 
 * @author pavlidis
 * @version $Id: PubMedXMLParser.java,v 1.24 2013/05/29 04:11:29 paul Exp $
 */
public class PubMedXMLParser {

    /** Element name whose presence anywhere in a parsed document makes extractBibRefs return null. */
    private static final String ERROR_TAG = "Error";
    /** Name assigned to the ExternalDatabase attached to each parsed accession. */
    private static final String PUB_MED_EXTERNAL_DB_NAME = "PubMed";
    protected static final Log log = LogFactory.getLog(PubMedXMLParser.class);

    // Re-created on every call to parse(InputStream).
    DocumentBuilder builder;

    // NOTE(review): in this class only setLenient(true) is ever called on this formatter; the actual date parsing
    // goes through DateUtils.parseDate with the patterns below.
    DateFormat df = DateFormat.getDateInstance(DateFormat.MEDIUM);

    // Patterns handed to DateUtils.parseDate: a "Jan 01, 2006"-style date, then a bare four-digit year.
    private final String[] formats = new String[] { "MMM dd, yyyy", "yyyy" };

    /**
     * Sets the publication date of a reference from the <code>Year</code> element children of a date node. The text of
     * each such child is parsed with the patterns in <code>formats</code>; a value that cannot be parsed is reported at
     * warn level and otherwise ignored.
     * 
     * @param bibRef reference whose publication date is set
     * @param item node whose direct element children are scanned for <code>Year</code>
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    public void extractBookPublicationYear(BibliographicReference bibRef, Node item) throws IOException {
        NodeList children = item.getChildNodes();
        for (int idx = 0; idx < children.getLength(); idx++) {
            Node child = children.item(idx);
            boolean isYearElement = child instanceof Element && "Year".equals(child.getNodeName());
            if (!isYearElement) {
                continue;
            }

            Element year = (Element) child;
            try {
                Date parsed = DateUtils.parseDate(XMLUtils.getTextValue(year), formats);
                bibRef.setPublicationDate(parsed);
            } catch (ParseException e) {
                log.warn("Could not extract date of publication from : " + XMLUtils.getTextValue(year));
            }
        }
    }

    /**
     * Sets the publisher of a reference from the <code>PublisherName</code> and <code>PublisherLocation</code> element
     * children of a publisher node. When both are present the result is <code>"Name [ Location]"</code>.
     * <p>
     * The two values are collected first and combined afterwards, so the result does not depend on the order of the
     * child elements, and a missing name no longer leaks the literal text "null" into the publisher string. If neither
     * element is present the reference is left untouched.
     * 
     * @param bibRef reference whose publisher is set
     * @param item node whose direct element children are scanned
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    public void extractPublisher(BibliographicReference bibRef, Node item) throws IOException {
        String publisherName = null;
        String publisherLocation = null;

        NodeList c = item.getChildNodes();
        for (int i = 0; i < c.getLength(); i++) {
            Node a = c.item(i);
            if (!(a instanceof Element)) {
                continue;
            }
            if (a.getNodeName().equals("PublisherName")) {
                publisherName = XMLUtils.getTextValue((Element) a);
            } else if (a.getNodeName().equals("PublisherLocation")) {
                publisherLocation = XMLUtils.getTextValue((Element) a);
            }
        }

        if (publisherLocation == null) {
            if (publisherName != null) {
                bibRef.setPublisher(publisherName);
            }
            return;
        }

        // Fall back to whatever publisher the reference already carries when this node supplies only a location.
        String base = publisherName != null ? publisherName : bibRef.getPublisher();
        String location = "[ " + publisherLocation + "]";
        bibRef.setPublisher(base == null ? location : base + " " + location);
    }

    /**
     * Parses a PubMed XML document from a stream and converts it into bibliographic references.
     * <p>
     * Emptiness of the stream is detected by reading its first byte and pushing it back, not by consulting
     * {@link InputStream#available()}: that method only estimates how many bytes can be read without blocking and may
     * return 0 for a stream that still has data to deliver.
     * 
     * @param is stream positioned at the start of the XML document
     * @return the references found in the document; <code>null</code> if the document contains an
     *         <code>Error</code> element, and an empty collection if it contains no recognized records
     * @throws RuntimeException wrapping the underlying IOException (including the "no data" case),
     *         ParserConfigurationException or SAXException
     */
    public Collection<BibliographicReference> parse(InputStream is) {

        try {
            PushbackInputStream content = new PushbackInputStream(is, 1);
            int firstByte = content.read();
            if (firstByte < 0) {
                throw new IOException("XML stream contains no data.");
            }
            content.unread(firstByte);

            // NOTE(review): the factory keeps its default DTD/external-entity handling; confirm that is acceptable
            // for every source this parser is fed from.
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setIgnoringComments(true);
            factory.setValidating(false);
            builder = factory.newDocumentBuilder();
            Document document = builder.parse(content);

            log.debug("done parsing");
            return extractBibRefs(document);
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(e);
        } catch (SAXException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Formats the authors found in a list of author nodes as a single string. Each author is rendered as
     * <code>"LastName, ForeName"</code> (or its <code>CollectiveName</code>), and authors are separated by
     * <code>"; "</code>. <code>Initials</code> and any other child elements are ignored.
     * <p>
     * Every author is assembled on its own before being appended, so an author that lacks a <code>ForeName</code> is
     * still separated from the next one by <code>"; "</code> instead of being run together with it.
     * 
     * @param authorList the child nodes of an <code>AuthorList</code> element; non-element nodes are skipped
     * @return the formatted authors, or <code>"(No authors listed)"</code> if no names were found
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private String extractAuthorList(NodeList authorList) throws IOException {
        StringBuilder al = new StringBuilder();
        for (int i = 0; i < authorList.getLength(); i++) {
            Node item = authorList.item(i);
            if (!(item instanceof Element)) {
                continue;
            }

            String lastName = null;
            String foreName = null;
            String collectiveName = null;

            NodeList nl = item.getChildNodes();
            for (int j = 0; j < nl.getLength(); j++) {
                Node m = nl.item(j);
                if (!(m instanceof Element)) {
                    continue;
                }
                Element f = (Element) m;
                String nodeName = f.getNodeName();
                if (nodeName.equals("LastName")) {
                    lastName = XMLUtils.getTextValue(f);
                } else if (nodeName.equals("ForeName")) {
                    foreName = XMLUtils.getTextValue(f);
                } else if (nodeName.equals("CollectiveName")) {
                    collectiveName = XMLUtils.getTextValue(f);
                }
            }

            StringBuilder author = new StringBuilder();
            if (StringUtils.isNotBlank(lastName)) {
                author.append(lastName);
            }
            if (StringUtils.isNotBlank(foreName)) {
                if (author.length() > 0) {
                    author.append(", ");
                }
                author.append(foreName);
            }
            if (StringUtils.isNotBlank(collectiveName)) {
                if (author.length() > 0) {
                    author.append("; ");
                }
                author.append(collectiveName);
            }

            if (author.length() == 0) {
                continue;
            }
            if (al.length() > 0) {
                al.append("; ");
            }
            al.append(author);
        }

        if (al.length() == 0) {
            return "(No authors listed)";
        }
        return al.toString();
    }

    /**
     * Converts a parsed PubMed document into bibliographic references.
     * <p>
     * <code>MedlineCitation</code> elements are processed as journal articles. <code>BookDocument</code> elements are
     * only consulted when the document contains no <code>MedlineCitation</code> at all.
     * 
     * @param document the parsed XML document
     * @return <code>null</code> if the document contains an <code>Error</code> element; otherwise the extracted
     *         references, possibly empty
     * @throws IOException if a <code>MedlineCitation</code> has no <code>Article</code> child, or on any IOException
     *         raised while extracting element text
     */
    private Collection<BibliographicReference> extractBibRefs(Document document) throws IOException {

        // Was there an error? (not found)
        if (document.getElementsByTagName(ERROR_TAG).getLength() > 0) {
            return null;
        }

        NodeList articles = document.getElementsByTagName("MedlineCitation");

        if (articles.getLength() == 0) {
            // Perhaps it is a book.
            articles = document.getElementsByTagName("BookDocument");
            if (articles.getLength() > 0) {
                return parseBookArticles(articles);
            }
            return new HashSet<BibliographicReference>();
        }

        Collection<BibliographicReference> result = new HashSet<BibliographicReference>();
        log.debug(articles.getLength() + " articles found in document");

        int i = 0;
        for (; i < articles.getLength(); i++) {
            BibliographicReference bibRef = BibliographicReference.Factory.newInstance();
            Node record = articles.item(i);

            Node article = processRecord(bibRef, record);

            // An assert is a no-op unless the JVM runs with -ea, which would defer the failure to an opaque
            // NullPointerException further down; fail explicitly instead.
            if (article == null) {
                throw new IOException("MedlineCitation at index " + i + " has no Article element");
            }

            Node journal = processArticle(bibRef, article);

            if (journal != null) {
                processJournalInfo(bibRef, journal);
            } else {
                log.warn("Article at index " + i + " has no Journal element; volume, issue and date not set");
            }

            result.add(bibRef);

            if (i > 0 && i % 1000 == 0) {
                log.info("Processed " + i + " articles");
            }
        }
        log.info("Processed " + i + " articles");

        return result;
    }

    /**
     * Builds one Compound per child of a chemical list node that itself has children. Within such a child, the text
     * of a <code>RegistryNumber</code> element becomes the registry number and the text of any other element becomes
     * the name.
     * 
     * @param chemNodes the <code>ChemicalList</code> node
     * @return the compounds found, possibly empty
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private Collection<Compound> extractChemicals(Node chemNodes) throws IOException {
        Collection<Compound> found = new HashSet<Compound>();

        NodeList entries = chemNodes.getChildNodes();
        for (int e = 0; e < entries.getLength(); e++) {
            NodeList parts = entries.item(e).getChildNodes();
            if (parts.getLength() > 0) {
                Compound compound = Compound.Factory.newInstance();
                for (int p = 0; p < parts.getLength(); p++) {
                    Node part = parts.item(p);
                    if (part instanceof Element) {
                        Element field = (Element) part;
                        String text = XMLUtils.getTextValue(field);
                        if ("RegistryNumber".equals(field.getNodeName())) {
                            compound.setRegistryNumber(text);
                        } else {
                            compound.setName(text);
                        }
                    }
                }
                log.debug(compound.getName());
                found.add(compound);
            }
        }

        return found;
    }

    /**
     * Derives a date from the <code>Year</code>, <code>Month</code>, <code>Day</code> and <code>MedlineDate</code>
     * element children of a date node.
     * <p>
     * If there is no <code>Year</code> but there is a <code>MedlineDate</code>, the latter is split on whitespace and
     * read as <code>year [month [day ...]]</code>. Range suffixes are dropped from every token, so
     * <code>"1983-84"</code>, <code>"2001 Nov-Dec"</code>, <code>"1983 Jul 9-16"</code>,
     * <code>"1983 Aug 31-Sep 6"</code> and <code>"1998 Dec-1999 Jan"</code> all resolve to the start of the range. A
     * third token is used as the day only if it is numeric. A missing month defaults to January and a missing day to
     * the 1st.
     * 
     * @param dateNode node whose direct element children describe the date
     * @return the parsed date, or <code>null</code> if no year is available or the assembled date cannot be parsed
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private Date extractJournalIssueDate(Node dateNode) throws IOException {

        String yearText = null;
        String medLineText = null;
        String monthText = null;
        String dayText = null;

        NodeList childNodes = dateNode.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node c = childNodes.item(i);
            if (!(c instanceof Element)) {
                continue;
            }
            String t = XMLUtils.getTextValue((Element) c);
            if (c.getNodeName().equals("Year")) {
                yearText = t;
            } else if (c.getNodeName().equals("Month")) {
                monthText = t;
            } else if (c.getNodeName().equals("Day")) {
                dayText = t;
            } else if (c.getNodeName().equals("MedlineDate")) {
                medLineText = t;
            }
        }

        // Kept for compatibility with anything else that reads this field; it plays no part in the parsing below.
        df.setLenient(true);

        if (yearText == null && medLineText != null) {
            String[] tokens = medLineText.trim().split("\\s+");
            String yearCandidate = tokens[0].replaceAll("-\\w+", "");
            if (yearCandidate.length() > 0) {
                yearText = yearCandidate;
                if (tokens.length > 1) {
                    monthText = tokens[1].replaceAll("-\\w+", "");
                }
                if (tokens.length > 2) {
                    // In "1998 Dec-1999 Jan" the third token is a month, not a day, so require digits.
                    String dayCandidate = tokens[2].replaceAll("-\\w+", "");
                    if (dayCandidate.length() > 0 && StringUtils.isNumeric(dayCandidate)) {
                        dayText = dayCandidate;
                    }
                }
            }
        }

        if (yearText == null) {
            // Without a year there is nothing to parse; avoid building a "Jan 1, null" string.
            log.warn("No date information from medline text: " + medLineText);
            return null;
        }

        if (monthText == null) {
            monthText = "Jan"; // arbitrary...
        }

        String dateString = monthText + " " + (dayText == null ? "1" : dayText) + ", " + yearText;

        try {
            return DateUtils.parseDate(dateString, formats);
        } catch (ParseException e) {
            log.warn("Could not parse date " + dateString + " from medlinetext=" + medLineText);
            return null;
        }
    }

    /**
     * Creates one Keyword per element child of a keyword list node, using the child's text as the term and the
     * result of isMajorHeading as its major-topic flag.
     * 
     * @param keywordNode the <code>KeywordList</code> node
     * @return the keywords found, possibly empty
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private Collection<Keyword> extractKeywords(Node keywordNode) throws IOException {
        Collection<Keyword> result = new HashSet<Keyword>();

        NodeList entries = keywordNode.getChildNodes();
        for (int k = 0; k < entries.getLength(); k++) {
            Node entry = entries.item(k);
            if (entry instanceof Element) {
                String term = XMLUtils.getTextValue((Element) entry);
                Boolean major = Boolean.valueOf(isMajorHeading(entry));

                Keyword created = Keyword.Factory.newInstance();
                created.setTerm(term);
                created.setIsMajorTopic(major);
                result.add(created);
            }
        }

        return result;
    }

    /**
     * Determines the publication date described by a date node. This is a thin wrapper that delegates to
     * extractJournalIssueDate.
     * 
     * @param dateNode node describing the date
     * @return whatever extractJournalIssueDate returns for the node, which may be <code>null</code>
     * @throws IOException propagated from extractJournalIssueDate
     */
    private Date extractPublicationDate(Node dateNode) throws IOException {
        return extractJournalIssueDate(dateNode);
    }

    /**
     * Creates one PublicationType per element child of a publication type list node, using the child's text as the
     * type.
     * 
     * @param pubtypeList the <code>PublicationTypeList</code> node
     * @return the publication types found, possibly empty
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private Collection<PublicationType> extractPublicationTypes(Node pubtypeList) throws IOException {
        Collection<PublicationType> types = new HashSet<PublicationType>();

        NodeList entries = pubtypeList.getChildNodes();
        for (int t = 0; t < entries.getLength(); t++) {
            Node entry = entries.item(t);
            if (entry instanceof Element) {
                String typeText = XMLUtils.getTextValue((Element) entry);
                PublicationType created = PublicationType.Factory.newInstance();
                created.setType(typeText);
                types.add(created);
            }
        }

        return types;
    }

    /**
     * Tells whether a node is flagged as a major topic, i.e. whether its <code>MajorTopicYN</code> attribute has the
     * value <code>"Y"</code>.
     * 
     * @param descriptor the node to inspect
     * @return <code>true</code> only if the attribute is present and equals <code>"Y"</code>; <code>false</code> if
     *         the node is <code>null</code>, carries no attributes, or lacks the attribute
     */
    private boolean isMajorHeading(Node descriptor) {
        if (descriptor == null || descriptor.getAttributes() == null) {
            return false;
        }
        // The attribute may be absent from the document, so its absence must not cause an NPE.
        Attr dmajorTopic = (Attr) descriptor.getAttributes().getNamedItem("MajorTopicYN");
        return dmajorTopic != null && "Y".equals(dmajorTopic.getValue());
    }

    /**
     * Converts each node of a list of book records into a bibliographic reference via processBookRecord, logging
     * progress at info level.
     * 
     * @param articles the book record nodes
     * @return one reference per node
     * @throws IOException propagated from processBookRecord
     */
    private Collection<BibliographicReference> parseBookArticles(NodeList articles) throws IOException {
        Collection<BibliographicReference> books = new HashSet<BibliographicReference>();

        int processed = 0;
        while (processed < articles.getLength()) {
            BibliographicReference bibRef = BibliographicReference.Factory.newInstance();
            processBookRecord(bibRef, articles.item(processed));
            books.add(bibRef);

            if (processed > 0 && processed % 1000 == 0) {
                log.info("Processed " + processed + " books");
            }
            processed++;
        }

        log.info("Processed " + processed + " books");
        return books;
    }

    /**
     * Attaches a new DatabaseEntry to the reference, using the text of the given element as its accession and a new
     * ExternalDatabase named by <code>PUB_MED_EXTERNAL_DB_NAME</code> as its database.
     * 
     * @param bibRef reference that receives the accession
     * @param record node holding the accession text; it is cast to Element
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private void processAccession(BibliographicReference bibRef, Node record) throws IOException {
        String pubmedId = XMLUtils.getTextValue((Element) record);

        ExternalDatabase pubmed = ExternalDatabase.Factory.newInstance();
        pubmed.setName(PUB_MED_EXTERNAL_DB_NAME);

        DatabaseEntry entry = DatabaseEntry.Factory.newInstance();
        entry.setAccession(pubmedId);
        entry.setExternalDatabase(pubmed);

        bibRef.setPubAccession(entry);
    }

    /**
     * Copies title, authors, pagination, abstract and publication types from the element children of an
     * <code>Article</code> node onto the reference.
     * <p>
     * An abstract with several <code>AbstractText</code> parts is joined one part per line, each prefixed with its
     * <code>Label</code> attribute when that attribute is present and not blank. Parts without a <code>Label</code>
     * and abstracts without any <code>AbstractText</code> are tolerated.
     * 
     * @param bibRef reference to fill in
     * @param article the <code>Article</code> node
     * @return the <code>Journal</code> child of the article, or <code>null</code> if there is none
     * @throws IOException any IOException raised while extracting element text
     */
    private Node processArticle(BibliographicReference bibRef, Node article) throws IOException {
        NodeList childNodes = article.getChildNodes();
        Node journal = null;
        for (int j = 0; j < childNodes.getLength(); j++) {
            Node item = childNodes.item(j);
            if (!(item instanceof Element)) {
                continue;
            }
            String name = item.getNodeName();
            if (name.equals("ArticleTitle")) {
                bibRef.setTitle(XMLUtils.getTextValue((Element) item));
            } else if (name.equals("Journal")) {
                journal = item;
            } else if (name.equals("AuthorList")) {
                bibRef.setAuthorList(extractAuthorList(item.getChildNodes()));
            } else if (name.equals("Pagination")) {
                bibRef.setPages(XMLUtils.extractOneChild(item, "MedlinePgn"));
            } else if (name.equals("Abstract")) {
                // abstracts can have parts
                List<String> abstractParts = XMLUtils.extractMultipleChildren(item, "AbstractText");

                if (abstractParts == null || abstractParts.isEmpty()) {
                    // No AbstractText at all: leave the abstract unset.
                    log.debug("Abstract element without AbstractText");
                } else if (abstractParts.size() > 1) {
                    StringBuilder buf = new StringBuilder();
                    NodeList jNodes = item.getChildNodes();
                    for (int q = 0; q < jNodes.getLength(); q++) {
                        Node jitem = jNodes.item(q);
                        if (!(jitem instanceof Element)) {
                            continue;
                        }
                        if (jitem.getNodeName().equals("AbstractText")) {
                            // getAttribute yields "" for a missing attribute, so an unlabelled part is safe.
                            String label = ((Element) jitem).getAttribute("Label");
                            String part = jitem.getTextContent();
                            if (StringUtils.isNotBlank(label)) {
                                buf.append(label).append(": ");
                            }
                            buf.append(part).append("\n");
                        }
                    }
                    bibRef.setAbstractText(buf.toString());
                } else {
                    bibRef.setAbstractText(abstractParts.get(0));
                }
            } else if (name.equals("PublicationTypeList")) {
                bibRef.setPublicationTypes(extractPublicationTypes(item));
            }
        }
        return journal;
    }

    /**
     * Copies book-level details from the element children of a <code>Book</code> node onto the reference: publisher,
     * publication date (only if none is set yet), editors or authors, and the book title.
     * <p>
     * An <code>AuthorList</code> is used only if it has a <code>Type</code> attribute: the value
     * <code>"editors"</code> sets the editor, any other value sets the author list. The book title always becomes the
     * publication, and also the title if the reference has none yet.
     * 
     * @param bibRef reference to fill in
     * @param article the <code>Book</code> node
     * @throws IOException any IOException raised while extracting element text
     */
    private void processBookInfo(BibliographicReference bibRef, Node article) throws IOException {
        NodeList children = article.getChildNodes();

        for (int idx = 0; idx < children.getLength(); idx++) {
            Node child = children.item(idx);
            if (!(child instanceof Element)) {
                continue;
            }
            Element element = (Element) child;
            String tag = element.getNodeName();

            if ("Publisher".equals(tag)) {
                extractPublisher(bibRef, element);
            } else if ("PubDate".equals(tag) && bibRef.getPublicationDate() == null) {
                extractBookPublicationYear(bibRef, element);
            } else if ("AuthorList".equals(tag)) {
                if (!element.hasAttribute("Type")) {
                    continue;
                }
                String people = extractAuthorList(element.getChildNodes());
                if ("editors".equals(element.getAttribute("Type"))) {
                    bibRef.setEditor(people);
                } else {
                    bibRef.setAuthorList(people);
                }
            } else if ("BookTitle".equals(tag)) {
                String bookTitle = XMLUtils.getTextValue(element);
                if (bibRef.getTitle() == null) {
                    bibRef.setTitle(bookTitle);
                }
                bibRef.setPublication(bookTitle);
            }
        }
    }

    /**
     * Fills in a reference from the element children of a book record: chapter title, book details (via
     * processBookInfo), authors, abstract, accession and contribution date.
     * <p>
     * The <code>AbstractText</code> sections of an abstract are joined with a single space. Trimming happens once,
     * after all sections have been collected, so the separator between sections is preserved.
     * 
     * @param bibRef reference to fill in
     * @param record the book record node
     * @throws IOException any IOException raised while extracting element text
     */
    private void processBookRecord(BibliographicReference bibRef, Node record) throws IOException {

        NodeList recordNodes = record.getChildNodes();
        for (int p = 0; p < recordNodes.getLength(); p++) {
            Node item = recordNodes.item(p);
            if (!(item instanceof Element)) {
                continue;
            }

            String name = item.getNodeName();
            if (name.equals("ArticleTitle")) {
                // this is the title of the chapter.
                bibRef.setTitle(StringUtils.strip(XMLUtils.getTextValue((Element) item)));
            } else if (name.equals("Book")) {
                processBookInfo(bibRef, item);
            } else if (name.equals("AuthorList")) {
                bibRef.setAuthorList(extractAuthorList(item.getChildNodes()));
            } else if (name.equals("Abstract")) {
                StringBuilder abstractText = new StringBuilder();
                NodeList abstractTextSections = item.getChildNodes();
                for (int q = 0; q < abstractTextSections.getLength(); q++) {
                    Node jitem = abstractTextSections.item(q);
                    if (!(jitem instanceof Element)) {
                        continue;
                    }
                    if (jitem.getNodeName().equals("AbstractText")) {
                        String section = StringUtils.trimToEmpty(XMLUtils.getTextValue((Element) jitem));
                        if (section.length() == 0) {
                            continue;
                        }
                        if (abstractText.length() > 0) {
                            abstractText.append(' ');
                        }
                        abstractText.append(section);
                    }
                }
                bibRef.setAbstractText(abstractText.toString());
            } else if (name.equals("PMID")) {
                processAccession(bibRef, item);
            } else if (name.equals("ContributionDate")) {
                /*
                 * Unusual, but happens for books that are updated with new sections. We use this instead of the
                 * publication date.
                 */
                extractBookPublicationYear(bibRef, item);
            }
        }

    }

    /**
     * Copies volume, issue and publication date from the <code>JournalIssue</code> children of a journal node onto
     * the reference.
     * 
     * @param bibRef reference to fill in
     * @param journal the <code>Journal</code> node
     * @return the child nodes of <code>journal</code>
     * @throws IOException any IOException raised while extracting element text
     */
    private NodeList processJournalInfo(BibliographicReference bibRef, Node journal) throws IOException {
        NodeList journalNodes = journal.getChildNodes();

        for (int j = 0; j < journalNodes.getLength(); j++) {
            Node child = journalNodes.item(j);
            boolean isIssue = child instanceof Element && "JournalIssue".equals(child.getNodeName());
            if (!isIssue) {
                continue;
            }

            NodeList issueNodes = child.getChildNodes();
            for (int k = 0; k < issueNodes.getLength(); k++) {
                Node field = issueNodes.item(k);
                if (!(field instanceof Element)) {
                    continue;
                }
                Element fieldElement = (Element) field;
                String fieldName = fieldElement.getNodeName();

                if ("Volume".equals(fieldName)) {
                    bibRef.setVolume(XMLUtils.getTextValue(fieldElement));
                } else if ("Issue".equals(fieldName)) {
                    bibRef.setIssue(XMLUtils.getTextValue(fieldElement));
                } else if ("PubDate".equals(fieldName)) {
                    bibRef.setPublicationDate(extractPublicationDate(fieldElement));
                }
            }
        }

        return journalNodes;
    }

    /**
     * Adds one MedicalSubjectHeading to the reference for every child of the heading list that itself has children.
     * Within such a child, a <code>DescriptorName</code> element supplies the heading's own term and major-topic
     * flag; every other element is added to the heading as a qualifier with its own term and flag.
     * 
     * @param meshHeadings the <code>MeshHeadingList</code> node
     * @param bibRef reference whose MeSH terms are extended
     * @throws IOException any IOException raised by XMLUtils.getTextValue
     */
    private void processMESH(Node meshHeadings, BibliographicReference bibRef) throws IOException {
        NodeList headings = meshHeadings.getChildNodes();

        for (int h = 0; h < headings.getLength(); h++) {
            NodeList terms = headings.item(h).getChildNodes();
            if (terms.getLength() == 0) {
                continue;
            }

            MedicalSubjectHeading heading = MedicalSubjectHeading.Factory.newInstance();

            // A heading is either a lone descriptor or a descriptor followed by qualifiers.
            for (int t = 0; t < terms.getLength(); t++) {
                Node termNode = terms.item(t);
                if (!(termNode instanceof Element)) {
                    continue;
                }
                Element term = (Element) termNode;
                String text = XMLUtils.getTextValue(term);
                boolean major = isMajorHeading(term);

                if ("DescriptorName".equals(term.getNodeName())) {
                    heading.setTerm(text);
                    heading.setIsMajorTopic(major);
                } else {
                    MedicalSubjectHeading qualifier = MedicalSubjectHeading.Factory.newInstance();
                    qualifier.setIsMajorTopic(major);
                    qualifier.setTerm(text);
                    heading.getQualifiers().add(qualifier);
                }
            }

            bibRef.getMeshTerms().add(heading);
        }
    }

    /**
     * Copies citation-level details from the element children of a record node onto the reference: chemicals, MeSH
     * terms, keywords, the journal abbreviation (<code>MedlineTA</code>), the accession (<code>PMID</code>) and, for
     * a <code>RetractionIn</code> entry in the comments/corrections list, a description naming the retraction's
     * source and PMID.
     * 
     * @param bibRef reference to fill in
     * @param record the citation record node
     * @return the <code>Article</code> child of the record, or <code>null</code> if there is none
     * @throws IOException any IOException raised while extracting element text
     */
    private Node processRecord(BibliographicReference bibRef, Node record) throws IOException {
        Node articleNode = null;

        NodeList children = record.getChildNodes();
        for (int idx = 0; idx < children.getLength(); idx++) {
            Node child = children.item(idx);
            if (!(child instanceof Element)) {
                continue;
            }
            String tag = child.getNodeName();

            if ("Article".equals(tag)) {
                articleNode = child;
            } else if ("ChemicalList".equals(tag)) {
                bibRef.setChemicals(extractChemicals(child));
            } else if ("MeshHeadingList".equals(tag)) {
                processMESH(child, bibRef);
            } else if ("KeywordList".equals(tag)) {
                bibRef.setKeywords(extractKeywords(child));
            } else if ("MedlineJournalInfo".equals(tag)) {
                NodeList infoNodes = child.getChildNodes();
                for (int n = 0; n < infoNodes.getLength(); n++) {
                    Node info = infoNodes.item(n);
                    if (info instanceof Element && "MedlineTA".equals(info.getNodeName())) {
                        bibRef.setPublication(XMLUtils.getTextValue((Element) info));
                    }
                }
            } else if ("PMID".equals(tag)) {
                processAccession(bibRef, child);
            } else if ("CommentsCorrectionsList".equals(tag)) {
                NodeList corrections = child.getChildNodes();
                for (int n = 0; n < corrections.getLength(); n++) {
                    Node correction = corrections.item(n);
                    if (!(correction instanceof Element)) {
                        continue;
                    }
                    Node refTypeAttr = correction.getAttributes().getNamedItem("RefType");
                    if (refTypeAttr == null) {
                        continue;
                    }

                    String refType = ((Attr) refTypeAttr).getValue();
                    log.debug(refType);
                    if (!refType.equals("RetractionIn")) {
                        continue;
                    }

                    // Such papers also have <PublicationType>Retracted Publication</PublicationType>.
                    try {
                        XPath xpath = XPathFactory.newInstance().newXPath();

                        XPathExpression sourceExpr = xpath.compile("RefSource/text()");
                        String source = (String) sourceExpr.evaluate(correction, XPathConstants.STRING);

                        XPathExpression pmidExpr = xpath.compile("PMID/text()");
                        String retractionPmid = (String) pmidExpr.evaluate(correction, XPathConstants.STRING);

                        bibRef.setDescription("Retracted [In: " + source + " PMID=" + retractionPmid + "]");
                    } catch (XPathExpressionException e) {
                        log.warn("Error while trying to get details of the retraction: " + e.getMessage(), e);
                    }
                }
            }
        }

        return articleNode;
    }

}