com.act.lcms.MzMLParser.java Source code

Introduction

Here is the source code for com.act.lcms.MzMLParser.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms;

import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathFactory;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Base class for streaming parsers of mzML-style XML files.
 * <p>
 * The input file is read as a stream of XML events.  Every {@code <spectrum>} element is copied into its own small
 * XML document, parsed into a DOM {@link Document}, and handed to {@link #handleSpectrumEntry(Document)}, which
 * subclasses implement to build a result object of type {@code S}.
 *
 * @param <S> The type of object produced for each spectrum entry.
 */
public abstract class MzMLParser<S> {
    /** Local name of the XML element that delimits one spectrum entry. */
    public static final String SPECTRUM_OBJECT_TAG = "spectrum";
    /** XML declaration prepended to every extracted spectrum document. */
    public static final String XML_PREAMBLE = "<?xml version=\"1.0\" encoding=\"utf-8\"?>";

    // XPath expressions for data extraction, relative to a document whose root element is a single <spectrum>.
    public static final String SPECTRUM_PATH_INDEX = "/spectrum/@index";
    public static final String SPECTRUM_PATH_ID = "/spectrum/@id";
    public static final String SPECTRUM_PATH_BASE_PEAK_MZ = "/spectrum/cvParam[@name='base peak m/z']/@value";
    public static final String SPECTRUM_PATH_BASE_PEAK_INTENSITY = "/spectrum/cvParam[@name='base peak intensity']/@value";
    public static final String SPECTRUM_PATH_SCAN_START_TIME = "/spectrum/scanList/scan/cvParam[@name='scan start time']/@value";
    public static final String SPECTRUM_PATH_SCAN_START_TIME_UNIT = "/spectrum/scanList/scan/cvParam[@name='scan start time']/@unitName";
    public static final String SPECTRUM_PATH_MZ_BINARY_DATA = "/spectrum/binaryDataArrayList/binaryDataArray[./cvParam/@name='m/z array']/binary/text()";
    public static final String SPECTRUM_PATH_INTENSITY_BINARY_DATA = "/spectrum/binaryDataArrayList/binaryDataArray[./cvParam/@name='intensity array']/binary/text()";

    /** Matches text of the form {@code function=N process=N scan=N}; not used within this class itself. */
    public static final Pattern SPECTRUM_EXTRACTION_REGEX = Pattern
            .compile("function=(\\d+) *process=(\\d+) scan=(\\d+)");

    // XPathFactory is known to be non-thread-safe, so every thread lazily gets its own instance.
    protected static final ThreadLocal<XPathFactory> XPATH_FACTORY = ThreadLocal.withInitial(XPathFactory::newInstance);

    /**
     * Helper function: builds an XML DocumentBuilderFactory that can be used repeatedly in this class.  Validation and
     * DTD loading (both grammar and external DTDs) are switched off.
     * <p>
     * TODO: move this to an XML utility class, as I'm sure we'll use it again some day.
     *
     * @return An XML DocumentBuilderFactory.
     * @throws ParserConfigurationException If the underlying XML implementation rejects one of the features set here.
     */
    public static DocumentBuilderFactory mkDocBuilderFactory() throws ParserConfigurationException {
        /* This factory must be configured within the context of a method call for exception handling.
         * TODO: can we work around this w/ dependency injection? */
        // from http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        docFactory.setValidating(false);
        /* NOTE(review): namespace awareness is enabled here and the SAX `namespaces` feature is disabled on the next
         * line; which setting wins is up to the XML implementation -- confirm this combination is intended. */
        docFactory.setNamespaceAware(true);
        docFactory.setFeature("http://xml.org/sax/features/namespaces", false);
        docFactory.setFeature("http://xml.org/sax/features/validation", false);
        docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        return docFactory;
    }

    /**
     * Decodes a base64 string into the list of little-endian 64-bit doubles it contains.
     *
     * @param b64 Base64 text (basic alphabet, no line breaks) whose decoded length is a multiple of eight bytes.
     * @return The decoded values in their original order; empty for an empty input string.
     * @throws IllegalArgumentException If {@code b64} is not valid base64.
     * @throws java.nio.BufferUnderflowException If the decoded data does not end on an eight-byte boundary.
     */
    protected static List<Double> base64ToDoubleList(String b64) {
        byte[] decodedBytes = Base64.getDecoder().decode(b64);
        ByteBuffer buf = ByteBuffer.wrap(decodedBytes).order(ByteOrder.LITTLE_ENDIAN);
        List<Double> values = new ArrayList<>(decodedBytes.length / Double.BYTES);
        while (buf.hasRemaining()) {
            values.add(buf.getDouble());
        }
        return values;
    }

    /**
     * Pairs up the elements of two equally sized lists, position by position.
     * <p>
     * TODO: isn't there some library method for this?  The Interwebs seem to say there isn't...
     *
     * @param keys The left-hand elements.
     * @param vals The right-hand elements.
     * @return A new list whose i-th entry is the pair {@code (keys[i], vals[i])}.
     * @throws IllegalArgumentException If the two lists differ in size.
     */
    protected static <K, V> List<Pair<K, V>> zipLists(List<K> keys, List<V> vals) {
        if (keys.size() != vals.size()) {
            throw new IllegalArgumentException(
                    String.format("Mismatched list sizes: %d vs %d", keys.size(), vals.size()));
        }
        List<Pair<K, V>> res = new ArrayList<>(keys.size());
        Iterator<K> ki = keys.iterator();
        Iterator<V> vi = vals.iterator();
        while (ki.hasNext() && vi.hasNext()) { // Length check should ensure these are exhausted simultaneously.
            res.add(Pair.of(ki.next(), vi.next()));
        }
        return res;
    }

    /** Creates the fresh character buffer, starting with the XML preamble, that collects one spectrum entry. */
    private static StringWriter newEntryBuffer() {
        StringWriter buffer = new StringWriter();
        buffer.append(XML_PREAMBLE).append("\n");
        return buffer;
    }

    public MzMLParser() {
    }

    /**
     * @return The XPathFactory belonging to the calling thread.
     */
    protected XPathFactory getXPathFactory() {
        return XPATH_FACTORY.get();
    }

    /**
     * Opens {@code inputFile} and returns an iterator that lazily parses one spectrum entry at a time.
     * <p>
     * The file is closed once the end of the document is reached or as soon as parsing fails.  An iterator that is
     * abandoned before it is exhausted keeps the file open.  Spectrum entries for which
     * {@link #handleSpectrumEntry(Document)} returns {@code null} are skipped.
     * <p>
     * Note that {@code next()} on the returned iterator returns {@code null} rather than throwing when no entries
     * remain; use {@code hasNext()} to detect the end.  Parsing failures during iteration are rethrown wrapped in a
     * {@link RuntimeException} and end the iteration.
     *
     * @param inputFile Path of the XML file to read.
     * @return An iterator over the objects built from the file's spectrum entries.
     * @throws ParserConfigurationException If the DOM parser cannot be configured.
     * @throws IOException If the file cannot be opened.
     * @throws XMLStreamException If the streaming XML reader or writer cannot be created.
     */
    public Iterator<S> getIterator(String inputFile)
            throws ParserConfigurationException, IOException, XMLStreamException {
        final DocumentBuilder docBuilder = mkDocBuilderFactory().newDocumentBuilder();

        final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
        final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();

        // Keep a handle on the raw stream: closing an XMLEventReader does not close its underlying input source.
        final FileInputStream fileStream = new FileInputStream(inputFile);
        final StringWriter initialBuffer = newEntryBuffer();
        final XMLEventReader eventReader;
        final XMLEventWriter initialWriter;
        try {
            eventReader = xmlInputFactory.createXMLEventReader(fileStream, "utf-8");
            initialWriter = xmlOutputFactory.createXMLEventWriter(initialBuffer);
        } catch (XMLStreamException | RuntimeException e) {
            // Don't leak the file handle if the streaming machinery can't be set up.
            try {
                fileStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }

        return new Iterator<S>() {
            // True while the events being read lie inside a spectrum element.
            private boolean inEntry = false;

            // Set to null once the input has been closed, i.e. when iteration is over.
            private XMLEventReader xr = eventReader;
            // TODO: is the use of the XML version/encoding tag definitely necessary?
            private StringWriter w = initialBuffer;
            private XMLEventWriter xw = initialWriter;

            // Spectrum that has been parsed ahead of time but not yet handed to the caller.
            private S next = null;

            /** Closes the event reader and the file beneath it; safe to call more than once. */
            private void release() throws XMLStreamException, IOException {
                XMLEventReader reader = xr;
                xr = null;
                try {
                    if (reader != null) {
                        reader.close();
                    }
                } finally {
                    fileStream.close();
                }
            }

            /* Because we're handling the XML as a stream, we can only determine whether we have another Spectrum to
             * return by attempting to parse the next one.  This method reads events until one complete spectrum has
             * been converted to a non-null result, and returns null if the document ends (or has already ended)
             * before that happens. */
            private S getNextSpectrum() {
                if (xr == null) {
                    return null;
                }

                S spectrum = null;
                try {
                    while (xr.hasNext()) {
                        XMLEvent e = xr.nextEvent();
                        if (!inEntry && e.isStartElement()
                                && SPECTRUM_OBJECT_TAG.equals(e.asStartElement().getName().getLocalPart())) {
                            xw.add(e);
                            inEntry = true;
                        } else if (e.isEndElement()
                                && SPECTRUM_OBJECT_TAG.equals(e.asEndElement().getName().getLocalPart())) {
                            xw.add(e);
                            xw.flush();
                            /* TODO: the XMLOutputFactory docs don't make it clear if/how events can be written directly
                             * into a new document structure, so we incur the cost of extracting each spectrum entry,
                             * serializing it, and re-reading it into its own document so it can be handled by XPath.
                             * Master this strange corner of the Java ecosystem and get rid of this
                             * doc -> string -> doc conversion. */
                            // Parse the characters directly: a detour through bytes would depend on the platform
                            // charset, which need not match the utf-8 declared in the preamble.
                            Document doc = docBuilder.parse(new InputSource(new StringReader(w.toString())));
                            spectrum = handleSpectrumEntry(doc);
                            xw.close();
                            /* Note: this can also be accomplished with `w.getBuffer().setLength(0);`, but using a new
                             * event writer seems safer. */
                            w = newEntryBuffer();
                            xw = xmlOutputFactory.createXMLEventWriter(w);
                            inEntry = false;
                            // Don't stop parsing if handleSpectrumEntry didn't like this spectrum document.
                            if (spectrum != null) {
                                break;
                            }
                        } else if (inEntry) {
                            // Copy everything that lies inside a spectrum entry.
                            xw.add(e);
                        }
                    }

                    // We've reached the end of the document; close the input to show that we're done.
                    if (!xr.hasNext()) {
                        release();
                    }
                } catch (Exception e) {
                    // The stream state is unreliable after a failure, so give up the file and end the iteration.
                    try {
                        release();
                    } catch (Exception closeFailure) {
                        e.addSuppressed(closeFailure);
                    }
                    // TODO: do better.  We seem to run into this sort of thing with Iterators a lot...
                    throw new RuntimeException(e);
                }

                return spectrum;
            }

            @Override
            public boolean hasNext() {
                // Prime the pump if the iterator doesn't have a value stored yet.
                if (this.next == null) {
                    this.next = getNextSpectrum();
                }
                return this.next != null;
            }

            @Override
            public S next() {
                // Prime the pump like we do in hasNext().
                if (this.next == null) {
                    this.next = getNextSpectrum();
                }

                // Take the available spectrum (null if the input is exhausted) and return it.
                S res = this.next;
                /* Advance to the next element immediately, making next() do the heavy lifting most of the time.
                 * Otherwise, the parsing will resume on hasNext(), which seems like it ought to be a light-weight
                 * operation. */
                this.next = getNextSpectrum();

                return res;
            }

        };
    }

    /**
     * Reads every spectrum entry of {@code inputFile} into memory.
     *
     * @param inputFile Path of the XML file to read.
     * @return The objects built from the file's spectrum entries, in document order.
     * @throws ParserConfigurationException If the DOM parser cannot be configured.
     * @throws IOException If the file cannot be opened.
     * @throws XMLStreamException If the streaming XML reader or writer cannot be created.
     */
    public List<S> parse(String inputFile) throws ParserConfigurationException, IOException, XMLStreamException {
        List<S> spectra = new ArrayList<>();
        Iterator<S> iter = this.getIterator(inputFile);
        while (iter.hasNext()) {
            spectra.add(iter.next());
        }

        return spectra;
    }

    /**
     * Converts one extracted spectrum document into a result object.
     *
     * @param doc A DOM document whose root element is a single {@code <spectrum>} entry.
     * @return The object built from the entry, or {@code null} to have the entry skipped.
     * @throws XPathException If evaluating an XPath expression against {@code doc} fails.
     */
    protected abstract S handleSpectrumEntry(Document doc) throws XPathException;
}