act.installer.pubchem.PubchemTTLMergerTest.java Source code

Introduction

Here is the source code for act.installer.pubchem.PubchemTTLMergerTest.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.pubchem;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.FlushOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksIterator;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileAttribute;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

public class PubchemTTLMergerTest {
    // Formatter-style logger for this test class.
    private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemTTLMergerTest.class);
    // Name of the resource directory (resolved relative to this class) whose files are passed to buildIndex.
    private static final String TEST_RDF_PATH = "rdf_synonyms";
    // Special directory entry names that tearDown skips when deleting files.
    private static final String THIS_DIR = ".";
    private static final String PARENT_DIR = "..";

    // Temporary directory hosting the RocksDB for a test; assigned in setUp and deleted in tearDown.
    private Path tempDirPath;

    /**
     * Creates a fresh temporary directory where the RocksDB for the upcoming test will live.
     *
     * @throws Exception if the temporary directory cannot be created
     */
    @Before
    public void setUp() throws Exception {
        // The file-attribute varargs are simply omitted; passing a raw, empty FileAttribute array is equivalent
        // but drags in a raw type.
        tempDirPath = Files.createTempDirectory(PubchemTTLMergerTest.class.getName());
    }

    /**
     * Recursively deletes the temporary directory created in {@code setUp} once the test is complete.
     *
     * @throws Exception if any file or directory under the temporary directory cannot be visited or deleted
     */
    @After
    public void tearDown() throws Exception {
        // JUnit runs @After methods even when @Before threw, in which case there is nothing to clean up and
        // an NPE here would only mask the original failure.
        if (tempDirPath == null) {
            return;
        }

        // TODO: use mocks instead maybe?  But testing RocksDB helps too...
        /* With help from:
         * http://stackoverflow.com/questions/779519/delete-directories-recursively-in-java/27917071#27917071 */
        Files.walkFileTree(tempDirPath, new FileVisitor<Path>() {
            @Override
            public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                // walkFileTree may ignore . and .., but I have never found it a /bad/ idea to check for these
                // special names.
                String fileName = file.toFile().getName();
                if (!THIS_DIR.equals(fileName) && !PARENT_DIR.equals(fileName)) {
                    Files.delete(file);
                }
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
                throw exc;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                // A non-null exception means iterating the directory failed part-way; surface it instead of
                // hiding it behind a likely "directory not empty" error from the delete below.
                if (exc != null) {
                    throw exc;
                }
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });

        // One last check to make sure the top level directory is removed.
        Files.deleteIfExists(tempDirPath);
    }

    /**
     * Reads the value stored under {@code key} in the given column family and deserializes it as a list of
     * strings.
     *
     * @param dbAndHandles the open RocksDB paired with its column family handles
     * @param columnFamily the column family to read from
     * @param key          the key to look up; encoded as UTF-8 bytes for the lookup
     * @return the deserialized list stored under {@code key}
     * @throws Exception if the lookup or the deserialization fails
     */
    public List<String> getValForKey(
            Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles,
            PubchemTTLMerger.COLUMN_FAMILIES columnFamily, String key) throws Exception {
        RocksDB db = dbAndHandles.getLeft();
        ColumnFamilyHandle cfh = dbAndHandles.getRight().get(columnFamily);
        // Use an explicit charset so the lookup does not depend on the platform default encoding.
        byte[] valBytes = db.get(cfh, key.getBytes(StandardCharsets.UTF_8));
        // A missing key yields null; fail with a readable message rather than an NPE inside the stream.
        assertNotNull(String.format("Column family %s should contain a value for key %s", columnFamily, key),
                valBytes);
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
            // The cast cannot be checked at runtime because of erasure; callers only use this helper on
            // column families they expect to hold string lists.
            @SuppressWarnings("unchecked")
            List<String> values = (List<String>) ois.readObject();
            return values;
        }
    }

    /**
     * Reads the value stored under {@code key} in the CID_TO_SYNONYMS column family and deserializes it as a
     * {@code PubchemSynonyms} object.
     *
     * @param dbAndHandles the open RocksDB paired with its column family handles
     * @param key          the key to look up; encoded as UTF-8 bytes for the lookup
     * @return the deserialized {@code PubchemSynonyms} stored under {@code key}
     * @throws Exception if the lookup or the deserialization fails
     */
    public PubchemSynonyms getPCSyonymsForKey(
            Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, String key)
            throws Exception {
        ColumnFamilyHandle cfh = dbAndHandles.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS);
        // Use an explicit charset so the lookup does not depend on the platform default encoding.
        byte[] valBytes = dbAndHandles.getLeft().get(cfh, key.getBytes(StandardCharsets.UTF_8));
        // A missing key yields null; fail with a readable message rather than an NPE inside the stream.
        assertNotNull(String.format("CID-to-synonyms index should contain a value for key %s", key), valBytes);
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
            return (PubchemSynonyms) ois.readObject();
        }
    }

    // Hash identifiers used by the tests as keys into the hash-keyed column families and as the expected
    // values of the CID-to-hashes index.  Declared final: they are constants and must never be reassigned.
    private static final String MD51 = "MD5_00000000000000000000000000000001";
    private static final String MD52 = "MD5_00000000000000000000000000000002";
    private static final String MD53 = "MD5_00000000000000000000000000000003";

    /**
     * Builds the index from the test RDF files, verifies the hash-to-synonyms, hash-to-MeSH and CID-to-hashes
     * column families, then runs the merge and verifies the resulting CID-to-synonyms entries.
     *
     * @throws Exception if the DB cannot be created, read, or flushed
     */
    @Test
    public void testIndexConstructionAndMerge() throws Exception {
        PubchemTTLMerger merger = new PubchemTTLMerger();
        Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = PubchemTTLMerger
                .createNewRocksDB(tempDirPath.toFile());

        // Close the DB even when an assertion fails, so a failing test does not leave it open during tearDown.
        try {
            // Alas, we can't swap this with a JAR-safe stream as we must list the files.
            File testSynonymFileDir = new File(this.getClass().getResource(TEST_RDF_PATH).getFile());
            File[] listedFiles = testSynonymFileDir.listFiles();
            // listFiles returns null when the path is not a readable directory.
            assertNotNull("Test RDF directory should be listable: " + testSynonymFileDir, listedFiles);
            List<File> testFiles = Arrays.asList(listedFiles);
            Collections.sort(testFiles);

            Set<String> expectedValues, actualValues;

            merger.buildIndex(dbAndHandles, testFiles);

            dbAndHandles.getLeft().flush(new FlushOptions());

            // Check the hash-to-synonym index.
            expectedValues = new HashSet<>(Arrays.asList("test1"));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD51));
            assertEquals("First hash-to-synonyms returns expected value(s)", expectedValues, actualValues);
            expectedValues = new HashSet<>(Arrays.asList("test2"));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD52));
            assertEquals("Second hash-to-synonyms returns expected value(s)", expectedValues, actualValues);
            expectedValues = new HashSet<>(Arrays.asList("TEST3", "test3"));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD53));
            assertEquals("Third hash-to-synonyms returns expected value(s)", expectedValues, actualValues);

            // Now check the MESH index.
            expectedValues = new HashSet<>(Arrays.asList("M01"));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_MESH, MD51));
            assertEquals("First hash-to-MeSH returns expected value(s)", expectedValues, actualValues);
            expectedValues = new HashSet<>(Arrays.asList("M02"));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_MESH, MD52));
            assertEquals("Second hash-to-MeSH returns expected value(s)", expectedValues, actualValues);

            // Finally (before merging) check the CID to hash index
            expectedValues = new HashSet<>(Arrays.asList(MD51));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID01"));
            assertEquals("First CID-to-hashes returns expected value(s)", expectedValues, actualValues);
            expectedValues = new HashSet<>(Arrays.asList(MD52, MD53));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID02"));
            assertEquals("Second CID-to-hashes returns expected value(s)", expectedValues, actualValues);
            expectedValues = new HashSet<>(Arrays.asList(MD53));
            actualValues = new HashSet<>(
                    getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID03"));
            assertEquals("Third CID-to-hashes returns expected value(s)", expectedValues, actualValues);

            merger.merge(dbAndHandles);

            PubchemSynonyms expectedSynonyms, actualSynonyms;

            expectedSynonyms = new PubchemSynonyms("CID01");
            expectedSynonyms.addMeSHId("M01");
            expectedSynonyms.addSynonym(PubchemTTLMerger.PC_SYNONYM_TYPES.TRIVIAL_NAME, "test1");
            actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID01");
            assertEquals("First CID-to-synonyms entry has expected PubchemSynonyms value", expectedSynonyms,
                    actualSynonyms);
            expectedSynonyms = new PubchemSynonyms("CID02");
            expectedSynonyms.addMeSHId("M02");
            expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.UNKNOWN,
                    new HashSet<>(Arrays.asList("test2")));
            expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME,
                    new HashSet<>(Arrays.asList("test3", "TEST3")));
            actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID02");
            assertEquals("Second CID-to-synonyms entry has expected PubchemSynonyms value", expectedSynonyms,
                    actualSynonyms);
            expectedSynonyms = new PubchemSynonyms("CID03");
            expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME,
                    new HashSet<>(Arrays.asList("test3", "TEST3")));
            actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID03");
            assertEquals("Third CID-to-synonyms entry has expected PubchemSynonyms value", expectedSynonyms,
                    actualSynonyms);

            dbAndHandles.getLeft().flush(new FlushOptions());
        } finally {
            dbAndHandles.getLeft().close();
        }
    }

    /**
     * Builds and merges the index, closes the DB, reopens it from the same directory, and checks that the
     * CID-to-synonyms column family contains exactly the expected entries.
     *
     * @throws Exception if the DB cannot be created, reopened, or read
     */
    @Test
    public void testValuesAreReadableAfterIndexIsClosedAndReopened() throws Exception {
        PubchemTTLMerger merger = new PubchemTTLMerger();
        Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = PubchemTTLMerger
                .createNewRocksDB(tempDirPath.toFile());

        // Close the freshly built DB even if building or merging fails, so it is never left open.
        try {
            // Alas, we can't swap this with a JAR-safe stream as we must list the files.
            File testSynonymFileDir = new File(this.getClass().getResource(TEST_RDF_PATH).getFile());
            List<File> testFiles = Arrays.asList(testSynonymFileDir.listFiles());
            Collections.sort(testFiles);

            merger.buildIndex(dbAndHandles, testFiles);
            merger.merge(dbAndHandles);
        } finally {
            dbAndHandles.getLeft().close();
        }

        dbAndHandles = merger.openExistingRocksDB(tempDirPath.toFile());

        Map<String, PubchemSynonyms> expected = new HashMap<String, PubchemSynonyms>() {
            {
                put("CID01",
                        new PubchemSynonyms("CID01", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {
                            {
                                put(PubchemTTLMerger.PC_SYNONYM_TYPES.TRIVIAL_NAME,
                                        new HashSet<>(Arrays.asList("test1")));
                            }
                        }, Arrays.asList("M01")));
                put("CID02",
                        new PubchemSynonyms("CID02", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {
                            {
                                put(PubchemTTLMerger.PC_SYNONYM_TYPES.UNKNOWN,
                                        new HashSet<>(Arrays.asList("test2")));
                                put(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME,
                                        new HashSet<>(Arrays.asList("TEST3", "test3")));
                            }
                        }, Arrays.asList("M02")));
                put("CID03",
                        new PubchemSynonyms("CID03", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {
                            {
                                put(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME,
                                        new HashSet<>(Arrays.asList("TEST3", "test3")));
                            }
                        }, Collections.emptyList()));
            }
        };

        // Track which keys the iterator actually yields; without this the loop below would pass vacuously
        // if the reopened column family were empty or missing entries.
        Set<String> seenKeys = new HashSet<>();

        // NOTE(review): the iterator and the reopened DB are never released here; the appropriate release
        // call depends on the RocksDB Java version in use -- confirm and add it.
        RocksIterator iterator = dbAndHandles.getLeft()
                .newIterator(dbAndHandles.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS));
        for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) {
            assertNotNull("Iterator key should never be null", iterator.key());
            assertNotNull("Iterator value should never be null", iterator.value());

            // Decode with an explicit charset so the result does not depend on the platform default.
            String key = new String(iterator.key(), StandardCharsets.UTF_8);
            PubchemSynonyms synonyms;
            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(iterator.value()))) {
                // Every value read from this column family is deserialized as a PubchemSynonyms object.
                synonyms = (PubchemSynonyms) ois.readObject();
            }
            assertEquals(String.format("Pubchem synonyms for %s match expected", key), expected.get(key), synonyms);
            seenKeys.add(key);
        }

        assertEquals("Reopened index contains every expected CID", expected.keySet(), seenKeys);
    }
}