act.installer.pubchem.PubchemTTLMerger.java Source code

Here is the source code for act.installer.pubchem.PubchemTTLMerger.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.pubchem;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.impl.SimpleIRI;
import org.eclipse.rdf4j.model.impl.SimpleLiteral;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.CompressionType;
import org.rocksdb.DBOptions;
import org.rocksdb.FlushOptions;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.WriteOptions;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

/**
 * This class implements a parser for Pubchem's TTL (Turtle) files.  These contain both features available in the
 * full Pubchem compound corpus and features that are not available in that dataset.
 */
public class PubchemTTLMerger {
    private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemTTLMerger.class);
    private static final Charset UTF8 = StandardCharsets.UTF_8;
    private static final Set<PC_SYNONYM_TYPES> DEFAULT_SYNONYM_DATA_TYPES = Collections
            .unmodifiableSet(Collections.singleton(PC_SYNONYM_TYPES.UNKNOWN));

    private static final String DEFAULT_ROCKSDB_COLUMN_FAMILY = "default";

    // Dunno why RocksDB needs two different types for these...
    private static final Options ROCKS_DB_CREATE_OPTIONS = new Options()
            .setCreateIfMissing(true)
            .setDisableDataSync(true)
            // Trying all sorts of performance tweaking knobs, which are not well documented. :(
            .setAllowMmapReads(true)
            .setAllowMmapWrites(true)
            .setWriteBufferSize(1 << 27)
            .setArenaBlockSize(1 << 20)
            .setCompressionType(CompressionType.SNAPPY_COMPRESSION); // Will hopefully trade CPU for I/O.

    public static final DBOptions ROCKS_DB_OPEN_OPTIONS = new DBOptions().setCreateIfMissing(false)
            .setDisableDataSync(true).setAllowMmapReads(true).setAllowMmapWrites(true);

    public static final String OPTION_INDEX_PATH = "x";
    public static final String OPTION_RDF_DIRECTORY = "d";
    public static final String OPTION_ONLY_SYNONYMS = "s";
    public static final String OPTION_ONLY_MESH = "m";
    public static final String OPTION_ONLY_PUBCHEM_IDS = "p";
    public static final String OPTION_ONLY_MERGE = "g";
    public static final String OPTION_OPEN_EXISTING_OKAY = "e";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class extracts Pubchem synonym data from RDF files into an on-disk index, then uses that index to join ",
            "the synonyms and MeSH ids with their corresponding pubchem ids." }, "");

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_INDEX_PATH).argName("index path")
                    .desc("A path to the directory where the on-disk index will be stored; must not already exist")
                    .hasArg().required().longOpt("index"));
            add(Option.builder(OPTION_RDF_DIRECTORY).argName("RDF directory")
                    .desc("A path to the directory of Pubchem RDF files").hasArg().longOpt("dir"));
            add(Option.builder(OPTION_ONLY_SYNONYMS)
                    .desc(String.format("If set, only '%s' files will be processed, useful for debugging",
                            PC_RDF_DATA_FILE_CONFIG.HASH_TO_SYNONYM.filePrefix))
                    .longOpt("only-synonyms"));
            add(Option.builder(OPTION_ONLY_MESH)
                    .desc(String.format("If set, only '%s' files will be processed, useful for debugging",
                            PC_RDF_DATA_FILE_CONFIG.HASH_TO_MESH.filePrefix))
                    .longOpt("only-mesh"));
            add(Option.builder(OPTION_ONLY_PUBCHEM_IDS)
                    .desc(String.format("If set, only '%s' files will be processed, useful for debugging",
                            PC_RDF_DATA_FILE_CONFIG.HASH_TO_CID.filePrefix))
                    .longOpt("only-pubchem-id"));
            add(Option.builder(OPTION_ONLY_MERGE)
                    .desc("If set, only merge on Pubchem id, assuming other columns are populated")
                    .longOpt("only-merge"));
            add(Option.builder(OPTION_OPEN_EXISTING_OKAY)
                    .desc("Use an existing index directory.  By default, indexes must be created in one shot.")
                    .longOpt("use-existing"));
            add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
        }
    };
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }
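
    /* Example invocation (a sketch; the jar name and file locations are hypothetical, the flags are defined above):
     *   java -cp act-assembly.jar act.installer.pubchem.PubchemTTLMerger \
     *     --index /mnt/data/pubchem-synonyms.rocksdb --dir /mnt/data/pubchem-rdf
     * Add --only-synonyms, --only-mesh, or --only-pubchem-id to restrict processing while debugging, or run
     * --only-merge against an existing index to re-do just the merge step. */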

    public PubchemTTLMerger() {
    }

    private enum PC_RDF_DATA_FILE_CONFIG {
        HASH_TO_SYNONYM("pc_synonym_value", COLUMN_FAMILIES.HASH_TO_SYNONYMS,
                PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.LITERAL, false, null),
        HASH_TO_CID("pc_synonym2compound", COLUMN_FAMILIES.CID_TO_HASHES,
                PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.COMPOUND, true, null),
        HASH_TO_MESH("pc_synonym_topic", COLUMN_FAMILIES.HASH_TO_MESH,
                PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.MeSH, false, null),
        HASH_TO_SYNONYM_TYPE("pc_synonym_type", COLUMN_FAMILIES.HASH_TO_SYNONYM_TYPE,
                PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.SIO, false,
                // Map CHEMINF values to synonym type designators.
                (String x) -> PC_SYNONYM_TYPES.getByCheminfId(x).name()),
        ;

        private String filePrefix;
        private COLUMN_FAMILIES columnFamily;
        private PC_RDF_DATA_TYPES keyType;
        private PC_RDF_DATA_TYPES valType;
        private boolean reverseSubjectAndObject;
        private Function<String, String> valueTransformer;

        PC_RDF_DATA_FILE_CONFIG(String filePrefix, COLUMN_FAMILIES columnFamily, PC_RDF_DATA_TYPES keyType,
                PC_RDF_DATA_TYPES valType, boolean reverseSubjectAndObject,
                Function<String, String> valueTransformer) {
            this.filePrefix = filePrefix;
            this.columnFamily = columnFamily;
            this.keyType = keyType;
            this.valType = valType;
            this.reverseSubjectAndObject = reverseSubjectAndObject;
            this.valueTransformer = valueTransformer;
        }

        public static PC_RDF_DATA_FILE_CONFIG getDataTypeForFile(File file) {
            String name = file.getName();
            for (PC_RDF_DATA_FILE_CONFIG t : PC_RDF_DATA_FILE_CONFIG.values()) {
                if (name.startsWith(t.filePrefix)) {
                    return t;
                }
            }
            return null;
        }

        public static AbstractRDFHandler makeHandlerForDataFile(
                Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, File file) {
            PC_RDF_DATA_FILE_CONFIG config = getDataTypeForFile(file);
            if (config == null) {
                LOGGER.info("No handler config found for file %s", file.getAbsolutePath());
                return null;
            }
            LOGGER.info("Selected handler type %s for file %s", config.name(), file.getName());

            return new PCRDFHandler(dbAndHandles, config.columnFamily, config.keyType, config.valType,
                    config.reverseSubjectAndObject, config.valueTransformer);
        }
    }

    /**
     * Each triple in the RDF files takes the form:
     * <pre>[subject namespace]:[subject value] [predicate namespace]:[predicate value] [object namespace]:[object value] .</pre>
     * Some of the files contain multiple types of values, only some of which we want to store.  For example, the
     * `topics` file contains both MeSH ids and "concepts" (I'm not sure what the latter actually represents).  We can
     * identify the MeSH ids based on their namespace and throw everything else away.
     *
     * Additionally, rdf4j represents different types of values with different Java objects.  IRI stands for
     * "internationalized resource identifier" (https://www.w3.org/TR/rdf11-concepts/#dfn-iri), and acts as a pointer
     * or identifier in the PC synonym corpus.  Synonym string values are modeled as literals, which have some sort of
     * label in some language (we ignore the language for now).
     *
     * This enum is a map of the useful namespaces and associated rdf4j model types to the parts of the synonym corpus
     * we want to extract.  Check out their use in PC_RDF_DATA_FILE_CONFIG to see how these are mapped to the
     * subjects and objects of different files in the synonym corpus; an illustrative triple appears after this enum.
     */
    private enum PC_RDF_DATA_TYPES {
        SYNONYM("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym/", PCRDFHandler.OBJECT_TYPE.IRI),
        MeSH("http://id.nlm.nih.gov/mesh/", PCRDFHandler.OBJECT_TYPE.IRI),
        COMPOUND("http://rdf.ncbi.nlm.nih.gov/pubchem/compound/", PCRDFHandler.OBJECT_TYPE.IRI),
        LITERAL("langString", PCRDFHandler.OBJECT_TYPE.LITERAL),
        SIO("http://semanticscience.org/resource/", PCRDFHandler.OBJECT_TYPE.IRI);

        private String urlOrDatatypeName;
        /* We only expect one kind of RDF value object at a time depending on the value's namespace, so constrain to that
         * to allow proper dispatch within the handler. */
        private PCRDFHandler.OBJECT_TYPE valueObjectType;

        PC_RDF_DATA_TYPES(String urlOrDatatypeName, PCRDFHandler.OBJECT_TYPE valueObjectType) {
            this.urlOrDatatypeName = urlOrDatatypeName;
            this.valueObjectType = valueObjectType;
        }

        public String getUrlOrDatatypeName() {
            return this.urlOrDatatypeName;
        }

        public PCRDFHandler.OBJECT_TYPE getValueObjectType() {
            return this.valueObjectType;
        }
    }
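
    /* For illustration, a triple in a pc_synonym2compound file looks roughly like the following (the exact
     * predicate varies and is never inspected by this parser; only the subject and object namespaces matter):
     *   synonym:MD5_<hash> sio:is-attribute-of compound:CID12345 .
     * SYNONYM matches the subject's namespace and COMPOUND the object's; because HASH_TO_CID sets
     * reverseSubjectAndObject, the pair lands in CID_TO_HASHES as (CID12345 -> MD5_<hash>). */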

    public enum COLUMN_FAMILIES {
        HASH_TO_SYNONYMS("hash_to_synonym"),
        CID_TO_HASHES("cid_to_hashes"),
        HASH_TO_MESH("hash_to_MeSH"),
        CID_TO_SYNONYMS("cid_to_synonyms"),
        HASH_TO_SYNONYM_TYPE("hash_to_synonym_type");

        private static final Map<String, COLUMN_FAMILIES> NAME_MAPPING = Collections
                .unmodifiableMap(new HashMap<String, COLUMN_FAMILIES>() {
                    {
                        for (COLUMN_FAMILIES family : COLUMN_FAMILIES.values()) {
                            put(family.name, family);
                        }
                    }
                });

        public static COLUMN_FAMILIES getFamilyByName(String name) {
            return NAME_MAPPING.get(name);
        }

        private String name;

        COLUMN_FAMILIES(String name) {
            this.name = name;
        }

        public String getName() {
            return this.name;
        }
    }

    // Note: @JsonSerialize and @JsonDeserialize didn't work here, so I've used @JsonCreator and @JsonValue instead.
    public enum PC_SYNONYM_TYPES {
        // Names derived from the Semantic Chemistry Ontology: https://github.com/egonw/semanticchemistry
        TRIVIAL_NAME("CHEMINF_000109", "trivial name", "trivial_name"),
        DEPOSITORY_NAME("CHEMINF_000339", "depositor-supplied name", "depositor_supplied_name"),
        IUPAC_NAME("CHEMINF_000382", "IUPAC name (LexiChem)", "IUPAC_name"),
        DRUG_BANK_ID("CHEMINF_000406", "DrugBank ID", "drugbank_id"),
        CHEBI_ID("CHEMINF_000407", "ChEBI ID", "ChEBI_id"),
        KEGG_ID("CHEMINF_000409", "KEGG ID", "KEGG_ID"),
        CHEMBL_ID("CHEMINF_000412", "ChEMBL ID", "ChEMBL_id"),
        CAS_REGISTRY_NUMBER("CHEMINF_000446", "CAS registry number", "cas_number"),
        EC_NUMBER("CHEMINF_000447", "EC number", "ec_number"),
        VALIDATED_CHEM_DB_ID("CHEMINF_000467", "Validated chemical database ID", "chem_db_id"),
        DRUG_TRADE_NAME("CHEMINF_000561", "Drug trade name", "trade_name"),
        INTL_NONPROPRIETARY_NAME("CHEMINF_000562", "International non-proprietary name", "non_proprietary_name"),
        UNIQUE_INGREDIENT_ID("CHEMINF_000563", "Unique ingredient ID", "unique_ingredient_id"),
        LIPID_MAPS_ID("CHEMINF_000564", "LipidMaps ID", "lipidmaps_id"),
        NSC_NUMBER("CHEMINF_000565", "National Service Center number", "nsc_number"),
        RTECS_ID("CHEMINF_000566", "RTECS ID", "RTECS_id"),
        UNKNOWN("NO_ID", "Unknown", "unknown");

        private static final Map<String, PC_SYNONYM_TYPES> CHEMINF_TO_TYPE = new HashMap<String, PC_SYNONYM_TYPES>() {
            {
                for (PC_SYNONYM_TYPES type : PC_SYNONYM_TYPES.values()) {
                    put(type.getCheminfId(), type);
                }
            }
        };

        private static final Map<String, PC_SYNONYM_TYPES> JSON_LABEL_TO_TYPE = new HashMap<String, PC_SYNONYM_TYPES>() {
            {
                for (PC_SYNONYM_TYPES type : PC_SYNONYM_TYPES.values()) {
                    put(type.getJsonLabel(), type);
                }
            }
        };

        public static PC_SYNONYM_TYPES getByCheminfId(String cheminfId) {
            return CHEMINF_TO_TYPE.getOrDefault(cheminfId, UNKNOWN);
        }

        @JsonCreator
        public static PC_SYNONYM_TYPES getByJsonLabel(String jsonLabel) {
            return JSON_LABEL_TO_TYPE.getOrDefault(jsonLabel, UNKNOWN);
        }

        String cheminfId;
        String label;
        String jsonLabel;

        PC_SYNONYM_TYPES(String cheminfId, String label, String jsonLabel) {
            this.cheminfId = cheminfId;
            this.label = label;
            this.jsonLabel = jsonLabel;
        }

        public String getCheminfId() {
            return cheminfId;
        }

        public String getLabel() {
            return label;
        }

        @JsonValue
        public String getJsonLabel() {
            return jsonLabel;
        }
    }
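
    /* A minimal sketch (not used elsewhere in this class) of the Jackson round trip that the annotations on
     * PC_SYNONYM_TYPES produce: serialization emits the jsonLabel, and unrecognized labels deserialize to UNKNOWN. */
    private static void examplePcSynonymTypeJsonRoundTrip() throws IOException {
        com.fasterxml.jackson.databind.ObjectMapper mapper = new com.fasterxml.jackson.databind.ObjectMapper();
        String json = mapper.writeValueAsString(PC_SYNONYM_TYPES.KEGG_ID); // Emits "\"KEGG_ID\"" via @JsonValue.
        PC_SYNONYM_TYPES roundTripped = mapper.readValue(json, PC_SYNONYM_TYPES.class); // Uses the @JsonCreator factory.
        assert roundTripped == PC_SYNONYM_TYPES.KEGG_ID;
    }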

    private static class PCRDFHandler extends AbstractRDFHandler {
        public static final Double MS_PER_S = 1000.0;

        /* The Pubchem RDF corpus represents all subjects as SimpleIRIs, but objects can be IRIs or literals.  Let the child
         * class decide which one it wants to handle. */
        enum OBJECT_TYPE {
            IRI, LITERAL;
        }

        private RocksDB db;
        private COLUMN_FAMILIES columnFamily;
        private ColumnFamilyHandle cfh;
        // Filter out RDF types (based on namespace) that we don't recognize or don't want to process.
        PC_RDF_DATA_TYPES keyType, valueType;
        boolean reverseSubjectAndObject;
        /* This is a super janky way to map synonym types to their enum values in the index.  Would be better done with a
         * subclass, but we'll leave that for a refactoring once we get this working. */
        Function<String, String> valueTransformer = null;

        DateTime startTime;
        // Is the RDF parser single threaded?  We don't know, so use an atomic counter to be safe.
        AtomicLong numProcessed = new AtomicLong(0);
        // Store unrecognized namespaces so we only log once per RDF file, rather than once per entry (which is a lot).
        Set<String> seenUnrecognizedSubjectNamespaces = new HashSet<>();
        Set<String> seenUnrecognizedObjectNamespaces = new HashSet<>();

        PCRDFHandler(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles,
                COLUMN_FAMILIES columnFamily, PC_RDF_DATA_TYPES keyType, PC_RDF_DATA_TYPES valueType,
                boolean reverseSubjectAndObject, Function<String, String> valueTransformer) {
            this.db = dbAndHandles.getLeft();
            this.columnFamily = columnFamily;
            this.cfh = dbAndHandles.getRight().get(columnFamily);
            this.keyType = keyType;
            this.valueType = valueType;
            this.reverseSubjectAndObject = reverseSubjectAndObject;
            this.valueTransformer = valueTransformer;
        }

        @Override
        public void startRDF() throws RDFHandlerException {
            super.startRDF();
            startTime = new DateTime().withZone(DateTimeZone.UTC);
        }

        @Override
        public void endRDF() throws RDFHandlerException {
            super.endRDF();
            DateTime endTime = new DateTime().withZone(DateTimeZone.UTC);
            Long runtimeInMillis = endTime.getMillis() - startTime.getMillis();
            Long numProcessedVal = numProcessed.get();
            LOGGER.info("PCRDFHandler reached end of RDF with %d events in %.3fs, at %.3f ms per event",
                    numProcessedVal, runtimeInMillis.floatValue() / MS_PER_S,
                    runtimeInMillis.doubleValue() / numProcessedVal.doubleValue());
            try {
                db.flush(new FlushOptions().setWaitForFlush(true));
            } catch (RocksDBException e) {
                LOGGER.error("Caught RocksDB exception when flushing after completing RDF processing: %s",
                        e.getMessage());
                throw new RDFHandlerException(e);
            }
        }

        @Override
        public void handleStatement(Statement st) {
            if (!(st.getSubject() instanceof SimpleIRI)) {
                // If we can't even recognize the type of the subject, something is very wrong.
                String msg = String.format("Unknown type of subject: %s",
                        st.getSubject().getClass().getCanonicalName());
                LOGGER.error(msg);
                throw new RuntimeException(msg);
            }

            SimpleIRI subjectIRI = (SimpleIRI) st.getSubject();
            // Filter out keys in namespaces we're not interested in.
            if (!(keyType.getUrlOrDatatypeName().equals(subjectIRI.getNamespace()))) {
                // If we don't recognize the namespace of the subject, then we probably can't handle this triple.
                if (!seenUnrecognizedSubjectNamespaces.contains(subjectIRI.getNamespace())) {
                    seenUnrecognizedSubjectNamespaces.add(subjectIRI.getNamespace());
                    LOGGER.warn("Unrecognized subject namespace: %s\n", subjectIRI.getNamespace());
                }
                return;
            }

            String subject = subjectIRI.getLocalName();
            String object = null;
            // Let the configured value type tell us what kind of object to expect.
            if (this.valueType.getValueObjectType() == OBJECT_TYPE.IRI && st.getObject() instanceof SimpleIRI) {
                SimpleIRI objectIRI = (SimpleIRI) st.getObject();
                if (!valueType.getUrlOrDatatypeName().equals(objectIRI.getNamespace())) {
                    // If we don't recognize the namespace of the object, then we probably can't handle this triple.
                    if (!seenUnrecognizedObjectNamespaces.contains(objectIRI.getNamespace())) {
                        seenUnrecognizedObjectNamespaces.add(objectIRI.getNamespace());
                        LOGGER.warn("Unrecognized object namespace: %s\n", objectIRI.getNamespace());
                    }
                    return;
                }
                object = objectIRI.getLocalName();
            } else if (this.valueType.getValueObjectType() == OBJECT_TYPE.LITERAL
                    && st.getObject() instanceof SimpleLiteral) {
                SimpleLiteral objectLiteral = (SimpleLiteral) st.getObject();
                IRI datatype = objectLiteral.getDatatype();
                if (!valueType.getUrlOrDatatypeName().equals(datatype.getLocalName())) {
                    // We're only expecting string values where we find literals.
                    if (!seenUnrecognizedObjectNamespaces.contains(datatype.getLocalName())) {
                        seenUnrecognizedObjectNamespaces.add(datatype.getLocalName());
                        LOGGER.warn("Unrecognized simple literal datatype: %s\n", datatype.getLocalName());
                    }
                    return;
                }
                object = objectLiteral.getLabel();
            } else {
                String msg = String.format("Unknown type of object: %s",
                        st.getObject().getClass().getCanonicalName());
                LOGGER.error(msg);
                throw new RuntimeException(msg);
            }

            /* I considered modeling this decision using subclasses, but it made the configuration too much of a pain.  Maybe
             * we'll do something clever the next time this code needs modification... */
            Pair<String, String> kvPair;
            if (reverseSubjectAndObject) {
                // If the keys, like PC ids, are on the right, we need to swap them around before storing.
                kvPair = Pair.of(object, subject);
            } else {
                kvPair = Pair.of(subject, object);
            }

            if (valueTransformer != null) {
                kvPair = Pair.of(kvPair.getKey(), valueTransformer.apply(kvPair.getValue()));
            }

            // Store the key and value in the appropriate column family.
            appendValueToList(db, cfh, kvPair.getKey(), kvPair.getValue());
            numProcessed.incrementAndGet();
        }

        private void appendValueToList(RocksDB db, ColumnFamilyHandle cfh, String key, String val) {
            StringBuffer buffer = new StringBuffer();
            List<String> storedObjects = null;
            byte[] keyBytes = key.getBytes(UTF8);
            // TODO: pull this out into a helper class or interface.  Alas, we must extend AbstractRDFHandler.
            try {
                if (db.keyMayExist(cfh, keyBytes, buffer)) {
                    byte[] existingVal = db.get(cfh, keyBytes);
                    if (existingVal != null) {
                        ObjectInputStream oi = new ObjectInputStream(new ByteArrayInputStream(existingVal));
                        storedObjects = (ArrayList<String>) oi.readObject(); // Note: assumes all values are lists.
                        /* Once upon a time I had a constraint here that crashed if we expected unique keys.  This was mainly to
                         * guard against hypothetical synonym hash collisions.  What ends up happening, however, is that Pubchem
                         * stores multiple values of one hash with different normalizations (like all uppercase or all lowercase),
                         * meaning there *will* be multiple values with the same hash, but these values will all be valid.
                         * Instead we just ignore potential hash collisions and assume that any "collisions" are intentional. */
                    } else {
                        storedObjects = new ArrayList<>(1);
                    }
                } else {
                    storedObjects = new ArrayList<>(1);
                }

                storedObjects.add(val);

                try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
                        ObjectOutputStream oo = new ObjectOutputStream(bos)) {
                    oo.writeObject(storedObjects);
                    oo.flush();

                    db.put(cfh, new WriteOptions(), keyBytes, bos.toByteArray());
                }
            } catch (RocksDBException e) {
                LOGGER.error("Caught unexpected RocksDBException: %s", e.getMessage());
                throw new RuntimeException(e);
            } catch (IOException e) {
                LOGGER.error("Caught unexpected IOException: %s", e.getMessage());
                throw new RuntimeException(e);
            } catch (ClassNotFoundException e) {
                LOGGER.error("Caught unexpected ClassNotFoundException: %s", e.getMessage());
                throw new RuntimeException(e);
            }
        }
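
        /* A minimal sketch of decoding a value written by appendValueToList above (not called elsewhere): every
         * value in the index is a Java-serialized ArrayList<String>, so readers cast accordingly. */
        private static List<String> exampleDecodeStoredList(byte[] storedValue)
                throws IOException, ClassNotFoundException {
            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(storedValue))) {
                return (List<String>) ois.readObject(); // Unchecked cast, but all writers in this class store lists.
            }
        }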
    }

    public static void main(String[] args) throws Exception {
        org.apache.commons.cli.Options opts = new org.apache.commons.cli.Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        PubchemTTLMerger merger = new PubchemTTLMerger();

        File rocksDBFile = new File(cl.getOptionValue(OPTION_INDEX_PATH));

        if (cl.hasOption(OPTION_ONLY_MERGE)) {
            if (!(rocksDBFile.exists() && rocksDBFile.isDirectory())) {
                System.err.format("Must specify an existing RocksDB index when using '%s'.\n", OPTION_ONLY_MERGE);
                HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
                System.exit(1);
            }
            merger.finish(merger.merge(rocksDBFile));
            return;
        }

        File rdfDir = new File(cl.getOptionValue(OPTION_RDF_DIRECTORY));
        if (!rdfDir.isDirectory()) {
            System.err.format("Must specify a directory of RDF files to be parsed.\n");
            HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        File[] filesInDirectoryArray = rdfDir.listFiles(new FilenameFilter() {
            private static final String TTL_GZ_SUFFIX = ".ttl.gz";

            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(TTL_GZ_SUFFIX);
            }
        });

        if (filesInDirectoryArray == null || filesInDirectoryArray.length == 0) {
            System.err.format("Found zero compressed TTL files in directory at '%s'.\n", rdfDir.getAbsolutePath());
            HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        // Sort files for stability/sanity.
        List<File> filesInDirectory = Arrays.asList(filesInDirectoryArray);
        Collections.sort(filesInDirectory);

        if (cl.hasOption(OPTION_ONLY_SYNONYMS)) {
            filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_SYNONYM);
        }

        if (cl.hasOption(OPTION_ONLY_MESH)) {
            filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_MESH);
        }

        if (cl.hasOption(OPTION_ONLY_PUBCHEM_IDS)) {
            filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_CID);
        }

        if (filesInDirectory.size() == 0) {
            System.err.format(
                    "Arrived at index initialization with no files to process.  "
                            + "Maybe too many filters were specified?  synonyms: %s, MeSH: %s, Pubchem ids: %s\n",
                    cl.hasOption(OPTION_ONLY_SYNONYMS), cl.hasOption(OPTION_ONLY_MESH),
                    cl.hasOption(OPTION_ONLY_PUBCHEM_IDS));
            HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        RocksDB.loadLibrary();
        Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = null;
        try {
            if (rocksDBFile.exists()) {
                if (!cl.hasOption(OPTION_OPEN_EXISTING_OKAY)) {
                    System.err.format(
                            "Index directory at '%s' already exists, delete before retrying or add '%s' option to reuse.\n",
                            rocksDBFile.getAbsolutePath(), OPTION_OPEN_EXISTING_OKAY);
                    HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                            true);
                    System.exit(1);
                } else {
                    LOGGER.info("Reusing existing index at %s", rocksDBFile.getAbsolutePath());
                    dbAndHandles = openExistingRocksDB(rocksDBFile);
                }
            } else {
                LOGGER.info("Creating new index at %s", rocksDBFile.getAbsolutePath());
                dbAndHandles = createNewRocksDB(rocksDBFile);
            }
            merger.buildIndex(dbAndHandles, filesInDirectory);

            merger.merge(dbAndHandles);
        } finally {
            if (dbAndHandles != null) {
                merger.finish(dbAndHandles);
            }
        }
    }

    protected static List<File> filterByFileContents(List<File> files, PC_RDF_DATA_FILE_CONFIG fileConfig) {
        return files.stream().filter(x -> x.getName().startsWith(fileConfig.filePrefix))
                .collect(Collectors.toList());
    }

    protected static Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> createNewRocksDB(File pathToIndex)
            throws RocksDBException {
        // Note: RocksDB is not auto-closable, so no try-with-resources here.
        Map<COLUMN_FAMILIES, ColumnFamilyHandle> columnFamilyHandles = new HashMap<>();

        Options options = ROCKS_DB_CREATE_OPTIONS;
        LOGGER.info("Opening index at %s", pathToIndex.getAbsolutePath());
        RocksDB db = RocksDB.open(options, pathToIndex.getAbsolutePath());

        for (COLUMN_FAMILIES cf : COLUMN_FAMILIES.values()) {
            LOGGER.info("Creating column family %s", cf.getName());
            ColumnFamilyHandle cfh = db.createColumnFamily(new ColumnFamilyDescriptor(cf.getName().getBytes(UTF8)));
            columnFamilyHandles.put(cf, cfh);
        }

        return Pair.of(db, columnFamilyHandles);
    }

    /**
     * Open an existing RocksDB index.  Use this after successful index generation to access the map of Pubchem compound
     * ids to synonyms/MeSH ids using the column family CID_TO_SYNONYMS.
     * @param pathToIndex A path to the RocksDB index directory to use.
     * @return A pair of the opened DB and a map from column families to their handles.
     * @throws RocksDBException Thrown if the index cannot be opened.
     */
    public static Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> openExistingRocksDB(File pathToIndex)
            throws RocksDBException {
        List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>(COLUMN_FAMILIES.values().length + 1);
        // Must also open the "default" family or RocksDB will probably choke.
        columnFamilyDescriptors.add(new ColumnFamilyDescriptor(DEFAULT_ROCKSDB_COLUMN_FAMILY.getBytes()));
        for (COLUMN_FAMILIES family : COLUMN_FAMILIES.values()) {
            columnFamilyDescriptors.add(new ColumnFamilyDescriptor(family.getName().getBytes()));
        }
        List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(columnFamilyDescriptors.size());

        DBOptions dbOptions = ROCKS_DB_OPEN_OPTIONS;
        dbOptions.setCreateIfMissing(false);
        RocksDB rocksDB = RocksDB.open(dbOptions, pathToIndex.getAbsolutePath(), columnFamilyDescriptors,
                columnFamilyHandles);
        Map<COLUMN_FAMILIES, ColumnFamilyHandle> columnFamilyHandleMap = new HashMap<>(
                COLUMN_FAMILIES.values().length);
        // TODO: can we zip these together more easily w/ Java 8?

        for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
            ColumnFamilyDescriptor cfd = columnFamilyDescriptors.get(i);
            ColumnFamilyHandle cfh = columnFamilyHandles.get(i);
            String familyName = new String(cfd.columnFamilyName(), UTF8);
            COLUMN_FAMILIES descriptorFamily = COLUMN_FAMILIES.getFamilyByName(familyName);
            if (descriptorFamily == null) {
                if (!DEFAULT_ROCKSDB_COLUMN_FAMILY.equals(familyName)) {
                    String msg = String.format(
                            "Found unexpected family name '%s' when trying to open RocksDB at %s", familyName,
                            pathToIndex.getAbsolutePath());
                    LOGGER.error(msg);
                    // Crash if we don't recognize the contents of this DB.
                    throw new RuntimeException(msg);
                }
                // Skip the default column family: it doesn't map to one of ours, but it is expected to exist.
                continue;
            }

            columnFamilyHandleMap.put(descriptorFamily, cfh);
        }

        return Pair.of(rocksDB, columnFamilyHandleMap);
    }
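
    /* A minimal sketch (not used elsewhere in this class) of consuming the merged index once merge() has run.
     * The compound id is hypothetical; values in CID_TO_SYNONYMS are Java-serialized PubchemSynonyms objects. */
    protected static PubchemSynonyms exampleFetchMergedSynonyms(
            Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, String pubchemId)
            throws RocksDBException, IOException, ClassNotFoundException {
        byte[] valBytes = dbAndHandles.getLeft().get(
                dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_SYNONYMS), pubchemId.getBytes(UTF8));
        if (valBytes == null) {
            return null; // No merged entry exists for this compound id.
        }
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
            return (PubchemSynonyms) ois.readObject();
        }
    }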

    protected Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> merge(File pathToRocksDB)
            throws RocksDBException, IOException, ClassNotFoundException {
        Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = openExistingRocksDB(pathToRocksDB);
        merge(dbAndHandles);
        return dbAndHandles;
    }

    protected void merge(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles)
            throws RocksDBException, IOException, ClassNotFoundException {
        LOGGER.info("Beginning merge on Pubchem CID");
        RocksDB db = dbAndHandles.getLeft();
        ColumnFamilyHandle pubchemIdCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_HASHES);
        ColumnFamilyHandle meshCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_MESH);
        ColumnFamilyHandle synonymCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_SYNONYMS);
        ColumnFamilyHandle synonymTypeCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_SYNONYM_TYPE);
        ColumnFamilyHandle mergeResultsCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_SYNONYMS);

        RocksIterator cidIterator = db.newIterator(pubchemIdCFH);
        // With help from https://github.com/facebook/rocksdb/wiki/Basic-Operations
        int processed = 0;
        for (cidIterator.seekToFirst(); cidIterator.isValid(); cidIterator.next()) {
            byte[] key = cidIterator.key();
            byte[] val = cidIterator.value();
            String pubchemId = new String(key, UTF8);
            List<String> hashes;
            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(val))) {
                // We know all our values so far have been lists of strings, so this should be completely safe.
                hashes = (List<String>) ois.readObject();
            }

            PubchemSynonyms pubchemSynonyms = new PubchemSynonyms(pubchemId);

            /* The hash keys are based on synonym value, which we can manually compute with:
             *   $ echo -n  'dimethyltin(iv)' | md5
             * This means that MeSH ids are linked to synonyms rather than pubchem ids.  We need to look up each cid-linked
             * hash in both the MeSH and synonym collections, as the key may legitimately exist in both (and serve to link
             * cid to synonym and cid to MeSH). */
            for (String hash : hashes) {
                /* Note: these ids are not proper MeSH topic ids, but are internal MeSH ids found in the RDF and TTL
                 * representations of the MeSH corpus.  You can find them in the MeSH .nt or .xml files, but they won't turn up
                 * anything on the MeSH website. */
                List<String> meshIds = getValueAsObject(db, meshCFH, hash);
                if (meshIds != null) {
                    pubchemSynonyms.addMeSHIds(meshIds);
                }

                List<String> synonyms = getValueAsObject(db, synonymCFH, hash);
                // There are, surprisingly, some dangling hashes in the DB!  Handle them gracefully.
                if (synonyms == null) {
                    LOGGER.warn(
                            "Dangling synonym hash reference, adding empty list in place of value: cid = %s, hash = %s",
                            pubchemId, hash);
                    synonyms = Collections.emptyList();
                }

                List<String> synonymTypeStrings = getValueAsObject(db, synonymTypeCFH, hash);
                Set<PC_SYNONYM_TYPES> synonymTypes = DEFAULT_SYNONYM_DATA_TYPES;
                if (synonymTypeStrings != null) {
                    synonymTypes = synonymTypeStrings.stream().map(PC_SYNONYM_TYPES::valueOf)
                            .collect(Collectors.toSet());
                }

                if (synonymTypes.size() == 0) {
                    LOGGER.warn("Found zero synonym types for synonym, defaulting to %s: %s %s, synonyms = %s",
                            PC_SYNONYM_TYPES.UNKNOWN.name(), pubchemId, hash, StringUtils.join(synonyms, ", "));
                }
                /* It turns out that *lots* of synonyms are duplicated as depositor supplied names, so don't complain about it
                 * here.  For performance sake we might want to consider changing the data model of PubchemSynonyms to reduce
                 * synonym string duplication, as the current model is pretty inefficient. */

                for (PC_SYNONYM_TYPES synonymType : synonymTypes) {
                    for (String synonym : synonyms) {
                        // Let the PubchemSynonyms object do the de-duplication for us rather than reducing `synonyms` to a Set.
                        pubchemSynonyms.addSynonym(synonymType, synonym);
                    }
                }
            }

            try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
                    ObjectOutputStream oo = new ObjectOutputStream(bos)) {
                oo.writeObject(pubchemSynonyms);
                oo.flush();

                db.put(mergeResultsCFH, key, bos.toByteArray());
            }

            processed++;
            if (processed % 100000 == 0) {
                LOGGER.info("Merged %d entries on Pubchem compound id", processed);
            }
        }
        LOGGER.info("Merge complete, %d entries processed", processed);
    }

    protected <T> T getValueAsObject(RocksDB db, ColumnFamilyHandle cfh, String key)
            throws RocksDBException, ClassNotFoundException, IOException {
        StringBuffer stringBuffer = new StringBuffer();
        T val = null;
        /* Check for existence before fetching.  IIRC doing otherwise might cause segfaults in the RocksDB JNI wrapper.
         * Or it might just be faster thanks to the DB's bloom filter. */
        if (db.keyMayExist(cfh, key.getBytes(), stringBuffer)) {
            byte[] valBytes = db.get(cfh, key.getBytes());
            // Make sure that the key actually exists (beware the "May" in keyMayExist).
            if (valBytes != null) {
                try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
                    val = (T) ois.readObject();
                }
            }
        }
        return val;
    }

    protected void buildIndex(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles,
            List<File> rdfFiles) throws RocksDBException, ClassNotFoundException, IOException {
        LOGGER.info("Building RocksDB index of data in RDF files");
        RDFParser parser = Rio.createParser(RDFFormat.TURTLE);

        LOGGER.info("Processing %d RDF files", rdfFiles.size());
        for (File rdfFile : rdfFiles) {
            LOGGER.info("Processing file %s", rdfFile.getAbsolutePath());
            AbstractRDFHandler handler = PC_RDF_DATA_FILE_CONFIG.makeHandlerForDataFile(dbAndHandles, rdfFile);
            if (handler == null) {
                LOGGER.info("Skipping file without defined handler: %s", rdfFile.getAbsolutePath());
                continue;
            }

            parser.setRDFHandler(handler);
            parser.parse(new GZIPInputStream(new FileInputStream(rdfFile)), "");
            LOGGER.info("Successfully parsed file at %s", rdfFile.getAbsolutePath());
        }
        LOGGER.info("Done processing RDF files");
    }

    protected void finish(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles) {
        LOGGER.info("Closing DB to complete merge.");
        dbAndHandles.getLeft().close();
    }
}