com.act.lcms.LCMSmzMLParser.java Source code

Introduction

Here is the source code for com.act.lcms.LCMSmzMLParser.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms;

import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathFactory;
import java.io.FileInputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.IOException;

/**
 * Parses mzXML files, converting the time points contained therein into {@link com.act.lcms.LCMSSpectrum} objects.
 *
 * mzXML has a few quirks that the user ought to be aware of:
 * <ul>
 *   <li>
 *     Each mzXML file may contain data for several kinds of scans, differentiated by their "function" value.  For
 *     the Waters instrument used by ECL, there may be three different scan functions; we currently are only interested
 *     in function 2.
 *   </li>
 *   <li>
 *     The mass/charge and intensity data for a given spectrum (time point) are stored as base64-encoded lists of
 *     little-endian IEEE 754 floating point numbers.  These lists are unpacked and zipped together by this parser.
 *   </li>
 *   <li>
 *     Each spectrum has a "base peak m/z" and "base peak intensity" value specified, which is the
 *     mass/charge with the highest intensity value (and that intensity value) at the current time point.  This
 *     {mass/charge, intensity} pair does <b>not</b> reappear in the spectrum data: it seems to be plucked out of the
 *     spectrum data.
 *   </li>
 * </ul>
 *
 * Note that the {@link #parse(String)} function for this class is a memory hog.  Use {@link #getIterator(String)}
 * wherever possible instead.
 */
public class LCMSmzMLParser extends MzMLParser<LCMSSpectrum> implements LCMSParser {

    // Paths for invariant checking.
    public static final String SPECTRUM_PATH_EXPECTED_VERSION = "/spectrum/cvParam[@name='MS1 spectrum']";
    public static final String SPECTRUM_PATH_EXPECTED_VERSION_DIODE_ARRAY = "/spectrum/cvParam[@name='electromagnetic radiation spectrum']";
    public static final String SPECTRUM_PATH_SCAN_LIST_COUNT = "/spectrum/scanList/@count";
    public static final String SPECTRUM_PATH_BINARY_DATA_ARRAY_LIST_COUNT = "/spectrum/binaryDataArrayList/@count";

    public LCMSmzMLParser() {
        super();
    }

    protected LCMSSpectrum handleSpectrumEntry(Document doc) throws XPathException {
        XPath xpath = getXPathFactory().newXPath();

        Double spectrumIndexD = (Double) xpath.evaluate(SPECTRUM_PATH_INDEX, doc, XPathConstants.NUMBER);
        if (spectrumIndexD == null) {
            System.err.format("WARNING: found spectrum document without index attribute.\n");
            return null;
        }
        Integer spectrumIndex = spectrumIndexD.intValue();

        if (xpath.evaluate(SPECTRUM_PATH_EXPECTED_VERSION, doc, XPathConstants.NODE) == null) {
            // if it is not MS1 Spectrum data then we will skip from the output.

            // check if it entry we see here is the diode array data, those we expect to silently skip
            // if on the other hand, even that is not matched; we truly have some unexpected entries, so report to user
            if (xpath.evaluate(SPECTRUM_PATH_EXPECTED_VERSION_DIODE_ARRAY, doc, XPathConstants.NODE) == null) {
                System.err.format(
                        "WARNING: found unexpected MS spectrum version in spectrum document %d.  Skipping.\n",
                        spectrumIndex);
            }

            return null;
        }

        String spectrumId = (String) xpath.evaluate(SPECTRUM_PATH_ID, doc, XPathConstants.STRING);
        if (spectrumId == null) {
            System.err.format("WARNING: no spectrum id found for documnt %d\n", spectrumIndex);
            return null;
        }

        Matcher matcher = SPECTRUM_EXTRACTION_REGEX.matcher(spectrumId);
        if (!matcher.find()) {
            System.err.format("WARNING: spectrum id for documnt %d did not match regex: %s\n", spectrumIndex,
                    spectrumId);
            return null;
        }
        Integer spectrumFunction = Integer.parseInt(matcher.group(1));
        Integer spectrumScan = Integer.parseInt(matcher.group(3));

        Integer scanListCount = ((Double) xpath.evaluate(SPECTRUM_PATH_SCAN_LIST_COUNT, doc, XPathConstants.NUMBER))
                .intValue();
        if (!Integer.valueOf(1).equals(scanListCount)) {
            System.err.format("WARNING: unexpected number of scan entries in spectrum document %d: %d",
                    spectrumIndex, scanListCount);
            return null;
        }

        Integer binaryDataCount = ((Double) xpath.evaluate(SPECTRUM_PATH_BINARY_DATA_ARRAY_LIST_COUNT, doc,
                XPathConstants.NUMBER)).intValue();
        if (!Integer.valueOf(2).equals(binaryDataCount)) {
            System.err.format("WARNING: unexpected number of binary data entries in spectrum document %d: %d",
                    spectrumIndex, binaryDataCount);
            return null;
        }

        Double basePeakMz = (Double) xpath.evaluate(SPECTRUM_PATH_BASE_PEAK_MZ, doc, XPathConstants.NUMBER);
        if (basePeakMz == null) {
            System.err.format("WARNING: no base peak m/z found for spectrum document %d\n", spectrumIndex);
            return null;
        }

        Double basePeakIntensity = (Double) xpath.evaluate(SPECTRUM_PATH_BASE_PEAK_INTENSITY, doc,
                XPathConstants.NUMBER);
        if (basePeakIntensity == null) {
            System.err.format("WARNING: no base peak intensity found for spectrum document %d\n", spectrumIndex);
            return null;
        }

        Double scanStartTime = (Double) xpath.evaluate(SPECTRUM_PATH_SCAN_START_TIME, doc, XPathConstants.NUMBER);
        if (scanStartTime == null) {
            System.err.format("WARNING: no scan start time found for spectrum document %d\n", spectrumIndex);
            return null;
        }

        String scanStartTimeUnit = (String) xpath.evaluate(SPECTRUM_PATH_SCAN_START_TIME_UNIT, doc,
                XPathConstants.STRING);
        if (scanStartTimeUnit == null) {
            System.err.format("WARNING: no scan start time unit found for spectrum document %d\n", spectrumIndex);
            return null;
        }

        String mzData = (String) xpath.evaluate(SPECTRUM_PATH_MZ_BINARY_DATA, doc, XPathConstants.STRING);
        if (mzData == null) {
            System.err.format("WARNING: no m/z data found for spectrum document %d\n", spectrumIndex);
            return null;
        }

        String intensityData = (String) xpath.evaluate(SPECTRUM_PATH_INTENSITY_BINARY_DATA, doc,
                XPathConstants.STRING);
        if (intensityData == null) {
            System.err.format("WARNING: no intensity data found for spectrum document %d\n", spectrumIndex);
            return null;
        }

        List<Double> mzs = base64ToDoubleList(mzData);
        List<Double> intensities = base64ToDoubleList(intensityData);
        List<Pair<Double, Double>> mzIntensityPairs = zipLists(mzs, intensities);

        return new LCMSSpectrum(spectrumIndex, scanStartTime, scanStartTimeUnit, mzIntensityPairs, basePeakMz,
                basePeakIntensity, spectrumFunction, spectrumScan, null);
    }

    @Override
    public List<LCMSSpectrum> parse(String inputFile)
            throws ParserConfigurationException, IOException, XMLStreamException {
        return super.parse(inputFile);
    }

    @Override
    public Iterator<LCMSSpectrum> getIterator(String inputFile)
            throws ParserConfigurationException, IOException, XMLStreamException {
        return super.getIterator(inputFile);
    }
}