act.installer.pubchem.PubchemParser.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.pubchem.PubchemParser.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.pubchem;

import act.server.MongoDB;
import act.shared.Chemical;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jaxen.JaxenException;
import org.jaxen.dom.DOMXPath;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.XMLEvent;
import javax.xml.xpath.XPathExpressionException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

public class PubchemParser {

    // Formatter logger: log messages in this class are printf-style format strings (%s, %d).
    private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemParser.class);

    // Short names of the command line options.
    private static final String OPTION_DATA_DIRECTORY = "i";
    private static final String OPTION_DB = "o";

    // Only files whose names end with this extension are picked up from the data directory.
    private static final String GZIP_FILE_EXT = ".gz";
    // Local name of the XML element that wraps one compound record.
    private static final String COMPOUND_DOC_TAG = "PC-Compound";

    // 2^27 bytes (128 MiB) of buffer space to help GZip really move.
    private static final int GZIP_BUFFER_SIZE = 1 << 27;

    // Value assigned to XMLInputFactory.IS_COALESCING when the stream factory is configured.
    private static final boolean ENABLE_XML_STREAM_TEXT_COALESCING = true;

    public static final String HELP_MESSAGE = "This class is used for parsing xml files and storing them in a db";

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
    static {
        HELP_FORMATTER.setWidth(100);
    }

    // Builders for every supported command line option; main() builds each one into an Options set.
    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>(Arrays.asList(
            Option.builder(OPTION_DATA_DIRECTORY)
                    .argName("OPTION_DATA_DIRECTORY")
                    .desc("The data directory where the pubchem files live")
                    .hasArg()
                    .required()
                    .type(String.class),
            Option.builder(OPTION_DB)
                    .argName("OPTION_DB")
                    .desc("The db to save the results in")
                    .hasArg()
                    .required()
                    .type(String.class),
            Option.builder("h")
                    .argName("help")
                    .desc("Prints this help message")
                    .longOpt("help")));

    /**
     * Key terminologies for this file:
     *
     * a) An example of how an inchi is packaged in the XML document
     *
     <PC-InfoData>
      <PC-InfoData_urn>
        <PC-Urn>
    <PC-Urn_label>InChI</PC-Urn_label>
    <PC-Urn_name>Standard</PC-Urn_name>
    <PC-Urn_datatype>
      <PC-UrnDataType value="string">1</PC-UrnDataType>
    </PC-Urn_datatype>
    <PC-Urn_version>1.0.4</PC-Urn_version>
    <PC-Urn_software>InChI</PC-Urn_software>
    <PC-Urn_source>iupac.org</PC-Urn_source>
    <PC-Urn_release>2012.11.26</PC-Urn_release>
        </PC-Urn>
      </PC-InfoData_urn>
      <PC-InfoData_value>
        <PC-InfoData_value_sval>InChI=1S/C12H17FO/c1-12(2,3)10-6-4-9(5-7-10)11(13)8-14/h4-7,11,14H,8H2,1-3H3</PC-InfoData_value_sval>
      </PC-InfoData_value>
     </PC-InfoData>
     *
     * In order to parse the inchi, we first detect the element <PC-Urn_label>InChI</PC-Urn_label>, since it tells us this
     * node is an inchi node. The element PC-Urn_label is called a "ResourceName". The term "inchi", a value of the resource
     * name node, is called a "ResourceValue". For element <PC-Urn_label>InChI</PC-Urn_label>, two XML events occur in sequence:
     * 1) START ELEMENT 2) CHARACTERS. We identify a ResourceName in (1) and ResourceValue in (2). In this case, we have two
     * nodes of interest:
     * a) <PC-Urn_label>InChI</PC-Urn_label>
     * b) <PC-InfoData_value_sval>InChI=1S/C12H17FO/c1-12(2,3)10-6-4-9(5-7-10)11(13)8-14/h4-7,11,14H,8H2,1-3H3</PC-InfoData_value_sval>
     *
     * Once we detect the node as an inchi node from a), we can easily detect the ResourceName "PC-InfoData_value_sval" and
     * store its value for b). We call PC-Urn_label "PUBCHEM_KEY" since it is a key to the value of the inchi, whereas
     * PC-InfoData_value_sval is the "PUBCHEM_VALUE" since it is the value to the inchi.
     */

    /**
     * XPath expressions used to pull features out of a buffered PC-Compound document.
     *
     * Constant naming structure: {@code <feature name>_<level>[_<sub-feature or structure>]_<type>}
     * {@code [IUPAC_NAME]_[L1]_[NODES]}: nodes in the original document (L1) that correspond to IUPAC name entries.
     * {@code [IUPAC_NAME]_[L2]_[VALUE]_[TEXT]}: textual names in the IUPAC name sub-tree (L2).
     */
    private enum PC_XPATHS {
        IUPAC_NAME_L1_NODES(
                "/PC-Compound/PC-Compound_props/PC-InfoData[./PC-InfoData_urn/PC-Urn/PC-Urn_label/text()=\"IUPAC Name\"]"),
        IUPAC_NAME_L2_TYPE_TEXT("/PC-InfoData/PC-InfoData_urn/PC-Urn/PC-Urn_name/text()"),
        IUPAC_NAME_L2_VALUE_TEXT("/PC-InfoData/PC-InfoData_value/PC-InfoData_value_sval/text()"),

        // TODO: ensure there is exactly one id_cid per compound.
        PC_ID_L1_TEXT(
                "/PC-Compound/PC-Compound_id/PC-CompoundType/PC-CompoundType_id/PC-CompoundType_id_cid/text()"),

        INCHI_L1_NODES(
                "/PC-Compound/PC-Compound_props/PC-InfoData[./PC-InfoData_urn/PC-Urn/PC-Urn_label/text()=\"InChI\"]"),
        /* A single expression such as
         * //PC-InfoData[./PC-InfoData_urn//PC-Urn_label/text()="InChI"]//PC-InfoData_value_sval/text()
         * would also work, but InChI parsing is split into two steps so that multiple InChI entries (which would be
         * insane) can be detected.
         */
        INCHI_L2_TEXT("/PC-InfoData/PC-InfoData_value/PC-InfoData_value_sval/text()"),

        SMILES_L1_NODES("//PC-InfoData[./PC-InfoData_urn//PC-Urn_label/text()=\"SMILES\"]"),
        SMILES_L2_TEXT("//PC-InfoData_value_sval/text()");

        // The raw XPath expression text.
        private final String path;

        PC_XPATHS(String path) {
            this.path = path;
        }

        /** @return The XPath expression text of this constant. */
        public String getPath() {
            return path;
        }

        /**
         * Builds a new Jaxen DOM XPath object from this constant's expression text.
         * @throws JaxenException Propagated from the DOMXPath constructor.
         */
        DOMXPath compile() throws JaxenException {
            return new DOMXPath(path);
        }
    }

    // NOTE(review): this set does not appear to be referenced anywhere else in this file -- confirm before relying on it.
    private static final Set<String> TAGS_TO_INCLUDE = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList("PC-Compound_id", "PC-Compound_props")));

    // Compiled form of every PC_XPATHS constant; filled in by init().
    private final Map<PC_XPATHS, DOMXPath> xpaths = new HashMap<>(PC_XPATHS.values().length);

    // Destination DB for extracted chemicals.
    private MongoDB db;
    // The two XML helpers below are created in init() and are null until then.
    private DocumentBuilder documentBuilder;
    private XMLInputFactory xmlInputFactory;

    /**
     * Creates a parser that writes the chemicals it extracts to the given DB.  The XPath expressions and XML helpers
     * are not set up here; {@link #init()} does that.
     * @param db The DB that chemicals are submitted to.
     */
    public PubchemParser(MongoDB db) {
        this.db = db;
    }

    /**
     * Prepares this parser for use: compiles every XPath expression in PC_XPATHS and creates the DOM document builder
     * and the XML stream factory.  Must be called before the PubchemParser can be used.
     * @throws ParserConfigurationException If the DocumentBuilder cannot be created.
     * @throws JaxenException If one of the XPath expressions fails to compile.
     */
    public void init() throws ParserConfigurationException, JaxenException {
        // Compiled here rather than in an initializer block because compilation throws a checked exception.
        for (PC_XPATHS expression : PC_XPATHS.values()) {
            DOMXPath compiled = expression.compile();
            xpaths.put(expression, compiled);
        }

        documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();

        /* Ask the XMLInputFactory for event streams that coalesce consecutive character events.  Without this we can
         * end up with malformed names and InChIs, as XPath will only fetch the first text node if there are several
         * text children under one parent. */
        xmlInputFactory = XMLInputFactory.newInstance();
        xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, ENABLE_XML_STREAM_TEXT_COALESCING);

        // Read the property back to confirm the factory actually accepted the setting.
        boolean coalescingEnabled = (Boolean) xmlInputFactory.getProperty(XMLInputFactory.IS_COALESCING);
        if (!coalescingEnabled) {
            LOGGER.error("Unable to configure XML stream to coalesce character elements.");
        } else {
            LOGGER.info("Successfully configured XML stream to coalesce character elements.");
        }
    }

    /**
     * Stores one chemical in the DB under the next id the DB reports as available.
     * @param chemical Chemical to be written to the DB.
     */
    private void writeChemicalToDB(Chemical chemical) {
        Long nextFreeId = db.getNextAvailableChemicalDBid();
        db.submitToActChemicalDB(chemical, nextFreeId);
    }

    /**
     * Extracts compound features from a sub-document/sub-tree containing one PC-Compound element.  Nodes that contain
     * interesting features (names, InChI, SMILES) are found and their text extracted using XPath.
     *
     * Note that matched feature nodes are moved out of `d` into documents of their own while they are processed.
     *
     * @param d The document from which to extract features.
     * @return A PubchemEntry object corresponding to features from one PC-Compound document, or null if the compound
     *         must be skipped (more than one InChI, or neither an InChI nor a SMILES).
     * @throws JaxenException If evaluating one of the XPath expressions fails.
     */
    private PubchemEntry extractPCCompoundFeatures(Document d) throws JaxenException {
        Long id = Long.valueOf(xpaths.get(PC_XPATHS.PC_ID_L1_TEXT).stringValueOf(d));
        PubchemEntry entry = new PubchemEntry(id);

        // Jaxen's API is from a pre-generics age!
        List<Node> nodes = (List<Node>) xpaths.get(PC_XPATHS.IUPAC_NAME_L1_NODES).selectNodes(d);
        if (nodes.size() == 0) {
            LOGGER.warn("No names available for compound %d", id);
        }
        for (Node n : nodes) {
            Document iupacNameDoc = moveToNewDocument(n);
            String type = xpaths.get(PC_XPATHS.IUPAC_NAME_L2_TYPE_TEXT).stringValueOf(iupacNameDoc);
            String value = xpaths.get(PC_XPATHS.IUPAC_NAME_L2_VALUE_TEXT).stringValueOf(iupacNameDoc);
            entry.setNameByType(type, value);
        }

        // We really need an InChI for a chemical to make sense, so log errors if we can't find one.
        boolean hasInChI = false;
        nodes = xpaths.get(PC_XPATHS.INCHI_L1_NODES).selectNodes(d);
        if (nodes.size() == 0) {
            LOGGER.warn("Found chemical (%d) with no InChIs, hoping for SMILES instead", id);
        } else if (nodes.size() > 1) {
            LOGGER.error("Assumption violation: found chemical with multiple InChIs (%d), skipping", id);
            return null;
        } else {
            hasInChI = true;
            Document inchiDoc = moveToNewDocument(nodes.get(0));
            String value = xpaths.get(PC_XPATHS.INCHI_L2_TEXT).stringValueOf(inchiDoc);
            entry.setInchi(value);
        }

        nodes = xpaths.get(PC_XPATHS.SMILES_L1_NODES).selectNodes(d);
        if (nodes.size() == 0) {
            // These messages contain a %d placeholder, so the id must be supplied for the format to succeed.
            if (hasInChI) {
                LOGGER.warn("Found chemical (%d) with no SMILES, using only InChI", id);
            } else {
                LOGGER.warn("Found chemical (%d) with no InChI or SMILES, skipping", id);
                return null;
            }
        } else {
            for (Node n : nodes) {
                Document smilesDoc = moveToNewDocument(n);
                String smiles = xpaths.get(PC_XPATHS.SMILES_L2_TEXT).stringValueOf(smilesDoc);
                entry.appendSmiles(smiles);
            }
        }

        return entry;
    }

    /**
     * Makes `n` the root of a brand new document so that absolute XPath expressions can be evaluated against it.
     *
     * In order to run XPath on a sub-document, we have to extract the relevant nodes into their own document object.
     * If we try to evaluate on `n` directly, we'll get matching paths for the original document but not for the nodes
     * we're looking at right now.  Very weird.
     * TODO: remember this way of running XPath on documents the next time we need to write an XML parser.
     *
     * @param n The node to re-home; it is adopted by (and appended to) the new document.
     * @return The new document whose only child is `n`.
     */
    private Document moveToNewDocument(Node n) {
        Document subDoc = documentBuilder.newDocument();
        subDoc.adoptNode(n);
        subDoc.appendChild(n);
        return subDoc;
    }

    /**
     * Consumes XML events from a PubChem stream until one complete PC-Compound element has been buffered into a DOM
     * document and converted into a Chemical.  Compounds whose features cannot be extracted are skipped, and reading
     * continues with the next compound.
     *
     * With help from
     * http://stackoverflow.com/questions/7998733/loading-local-chunks-in-dom-while-parsing-a-large-xml-file-in-sax-java
     *
     * @param eventReader The xml event reader we are parsing the XML from
     * @return The constructed chemical, or null when the stream runs out of events (just like readLine()).
     * @throws XMLStreamException If reading the next event from the stream fails.
     * @throws JaxenException If XPath evaluation fails while extracting compound features.
     */
    public Chemical extractNextChemicalFromXMLStream(XMLEventReader eventReader)
            throws XMLStreamException, JaxenException {
        Document compoundDoc = null;
        // The element currently being filled in; null whenever we are outside of a PC-Compound tree.
        Element cursor = null;

        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();
            int eventType = event.getEventType();

            if (eventType == XMLStreamConstants.START_ELEMENT) {
                String tagName = event.asStartElement().getName().getLocalPart();
                if (COMPOUND_DOC_TAG.equals(tagName)) {
                    // Start of a compound object: begin buffering into a fresh document.
                    compoundDoc = documentBuilder.newDocument();
                    cursor = compoundDoc.createElement(tagName);
                    compoundDoc.appendChild(cursor);
                } else if (cursor != null) {
                    // Inside a compound: add a child element and descend into it.
                    Element child = compoundDoc.createElement(tagName);
                    cursor.appendChild(child);
                    cursor = child;
                }
                // Elements outside of a PC-Compound tree just pass by.
            } else if (eventType == XMLStreamConstants.CHARACTERS) {
                if (cursor == null) {
                    continue; // Not in a PC-Compound tree.
                }
                Characters text = event.asCharacters();
                // Whitespace-only strings just inflate the size of the DOM.  Text coalescing makes dropping them safe.
                if (text.isWhiteSpace()) {
                    continue;
                }
                // Rely on the XMLEventStream to coalesce consecutive text events.
                cursor.appendChild(compoundDoc.createTextNode(text.getData()));
            } else if (eventType == XMLStreamConstants.END_ELEMENT) {
                if (cursor == null) {
                    continue; // Not in a PC-Compound tree.
                }
                String tagName = event.asEndElement().getName().getLocalPart();
                Node parent = cursor.getParentNode();

                if (parent instanceof Element) {
                    // Closing a nested element: climb back up one level.
                    cursor = (Element) parent;
                    continue;
                }

                if (!(parent instanceof Document && tagName.equals(COMPOUND_DOC_TAG))) {
                    // This should not happen, but is here as a sanity check.
                    throw new RuntimeException(String.format("Parent of XML element %s is of type %d, not Element",
                            cursor.getTagName(), parent.getNodeType()));
                }

                // We're back at the top of the node stack!  Convert the buffered document into a Chemical.
                PubchemEntry entry = extractPCCompoundFeatures(compoundDoc);
                if (entry != null) {
                    return entry.asChemical();
                }
                // Skip this entry if we can't process it correctly by resetting the world and continuing on.
                compoundDoc = null;
                cursor = null;
            }
            // TODO: do we care about attributes or other XML structures?
        }

        return null;
    }

    /**
     * This function reads a given gzipped XML file, passes the xml event stream to a function to parse out the
     * chemicals, and writes each chemical to the db.  The file, the GZIP stream and the event reader are all closed
     * before this method returns, whether it completes normally or with an exception.
     * @param file The input gzipped file that is being processed.
     * @throws XMLStreamException If the XML stream cannot be created, read or closed.
     * @throws JaxenException If XPath evaluation fails while extracting a chemical.
     * @throws IOException If the file cannot be opened, decompressed or closed.
     */
    public void openCompressedXMLFileAndWriteChemicals(File file)
            throws XMLStreamException, JaxenException, IOException {
        // Two separate resources so the file handle is released even if the GZIP header turns out to be invalid.
        try (FileInputStream fileStream = new FileInputStream(file);
             GZIPInputStream gzipStream = new GZIPInputStream(fileStream, GZIP_BUFFER_SIZE)) {
            XMLEventReader eventReader = xmlInputFactory.createXMLEventReader(gzipStream);
            try {
                Chemical result;
                while ((result = extractNextChemicalFromXMLStream(eventReader)) != null) {
                    writeChemicalToDB(result);
                }
            } finally {
                // XMLEventReader is not AutoCloseable and its close() does not close the underlying stream,
                // so it is released here and the streams are closed by the enclosing try-with-resources.
                eventReader.close();
            }
        }
    }

    /**
     * Main driver of the class: processes the given gzipped XML files one after another, in list order, logging
     * progress before each file.
     * @param filesToProcess The files of the pubchem dump to process.
     * @throws XMLStreamException Propagated from processing a file.
     * @throws JaxenException Propagated from processing a file.
     * @throws IOException Propagated from processing a file.
     */
    private void run(List<File> filesToProcess) throws XMLStreamException, JaxenException, IOException {
        int filesStarted = 0;
        for (File currentFile : filesToProcess) {
            filesStarted++;
            LOGGER.info("Processing file %d of %d", filesStarted, filesToProcess.size());
            LOGGER.info("File name is %s", currentFile.getPath());
            openCompressedXMLFileAndWriteChemicals(currentFile);
        }
    }

    /**
     * This function finds the gzipped files (names ending in GZIP_FILE_EXT) that sit directly inside a directory.
     * @param dataDirectory The directory of interest.
     * @return The matching files, sorted for installer stability.
     * @throws RuntimeException If the path does not exist, is not a directory, or cannot be listed.
     * @throws XMLStreamException Declared for callers; not thrown by the current implementation.
     * @throws IOException Declared for callers; not thrown by the current implementation.
     */
    private static List<File> findGZippedFilesInDirectory(String dataDirectory)
            throws XMLStreamException, IOException {
        File folder = new File(dataDirectory);

        if (!folder.exists()) {
            String msg = String.format("The folder %s does not exists", folder.getAbsolutePath());
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        }

        // listFiles() returns null (rather than an empty array) when the path is not a directory or an I/O error
        // occurs, so guard against that instead of failing later with a NullPointerException.
        File[] children = folder.listFiles();
        if (children == null) {
            String msg = String.format("The path %s is not a directory or could not be listed",
                    folder.getAbsolutePath());
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        }

        List<File> result = Arrays.stream(children).filter(f -> f.getName().endsWith(GZIP_FILE_EXT))
                .collect(Collectors.toList());

        // Sort files lexicographically for installer stability.
        Collections.sort(result);

        return result;
    }

    /**
     * Command line entry point.  Parses the options, then creates a PubchemParser backed by the named DB and runs it
     * over every gzipped file found in the data directory.  If option parsing fails, the help text is printed and the
     * JVM exits with status 1.
     * @param args The command line arguments.
     * @throws Exception Anything raised while initializing or running the parser.
     */
    public static void main(String[] args) throws Exception {
        // Assemble the option set from the declared builders.
        Options opts = new Options();
        OPTION_BUILDERS.forEach(builder -> opts.addOption(builder.build()));

        String programName = PubchemParser.class.getCanonicalName();

        CommandLine cl = null;
        try {
            cl = new DefaultParser().parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(programName, HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(programName, HELP_MESSAGE, opts, null, true);
            return;
        }

        String dataDir = cl.getOptionValue(OPTION_DATA_DIRECTORY);
        String dbName = cl.getOptionValue(OPTION_DB);

        PubchemParser pubchemParser = new PubchemParser(new MongoDB("localhost", 27017, dbName));
        pubchemParser.init();

        List<File> gzippedFiles = findGZippedFilesInDirectory(dataDir);
        pubchemParser.run(gzippedFiles);
    }
}