org.opencb.cellbase.lib.loader.MongoDBCellBaseLoader.java Source code

Java tutorial

Introduction

Here is the source code for org.opencb.cellbase.lib.loader.MongoDBCellBaseLoader.java

Source

/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.loader;

import com.mongodb.bulk.BulkWriteResult;
import org.apache.commons.lang3.StringUtils;
import org.bson.Document;
import org.opencb.cellbase.core.api.CellBaseDBAdaptor;
import org.opencb.cellbase.core.api.DBAdaptorFactory;
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.config.DatabaseCredentials;
import org.opencb.cellbase.core.loader.CellBaseLoader;
import org.opencb.cellbase.core.loader.LoadRunner;
import org.opencb.cellbase.core.loader.LoaderException;
import org.opencb.cellbase.lib.MongoDBCollectionConfiguration;
import org.opencb.cellbase.lib.impl.MongoDBAdaptorFactory;
import org.opencb.commons.datastore.core.DataStoreServerAddress;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.datastore.mongodb.MongoDBConfiguration;
import org.opencb.commons.datastore.mongodb.MongoDataStore;
import org.opencb.commons.datastore.mongodb.MongoDataStoreManager;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.BlockingQueue;

/**
 * Created by parce on 18/02/15.
 */
public class MongoDBCellBaseLoader extends CellBaseLoader {

    // Connection manager for the configured MongoDB server(s); created in init(), released in close().
    private MongoDataStoreManager mongoDataStoreManager;
    // Handle to the target database, obtained from the manager in init().
    private MongoDataStore mongoDataStore;
    // Collection that load() inserts documents into; resolved from 'data' in init().
    private MongoDBCollection mongoDBCollection;

    // Factory used by getDBAdaptor() to build the adaptor for the data type being loaded.
    private DBAdaptorFactory dbAdaptorFactory;
    // Adaptor used by prepareBatchAndUpdate(); getDBAdaptor() returns null for several data types.
    @Deprecated
    private CellBaseDBAdaptor dbAdaptor;

    // Folder containing the '*-indexes.js' scripts, read from the 'mongodb-index-folder' option (may be null).
    private Path indexScriptFolder;
    // Chunk sizes used by addChunkId() to build '_chunkIds'; left null for collections without chunking.
    private int[] chunkSizes;
    // Clinical source being loaded ("cosmic", "clinvar" or "gwas"); drives addClinicalPrivateFields().
    private String clinicalVariantSource;

    // Values of clinicalVariantSource that addClinicalPrivateFields() switches on.
    private static final String CLINVARVARIANTSOURCE = "clinvar";
    private static final String COSMICVARIANTSOURCE = "cosmic";
    private static final String GWASVARIANTSOURCE = "gwas";

    public MongoDBCellBaseLoader(BlockingQueue<List<String>> queue, String data, String database) {
        this(queue, data, database, null, null, null);
    }

    public MongoDBCellBaseLoader(BlockingQueue<List<String>> queue, String data, String database, String field,
            String[] innerFields, CellBaseConfiguration cellBaseConfiguration) {
        super(queue, data, database, field, innerFields, cellBaseConfiguration);
        if (cellBaseConfiguration.getDatabases().getMongodb().getOptions().get("mongodb-index-folder") != null) {
            indexScriptFolder = Paths.get(
                    cellBaseConfiguration.getDatabases().getMongodb().getOptions().get("mongodb-index-folder"));
        }
    }

    /**
     * Opens the MongoDB connection used for loading. The OpenCB 'datastore' project is used:
     * <ol>
     * <li>a manager is created to connect to the physical server(s),</li>
     * <li>a 'datastore' object connects to the specific database,</li>
     * <li>a connection to the collection is stored in 'mongoDBCollection'.</li>
     * </ol>
     * Chunk sizes and the (deprecated) DB adaptor for the data type are resolved as well.
     *
     * @throws LoaderException if no configuration is available, a host port is not numeric,
     *                         or the data type is unknown
     */
    @Override
    public void init() throws LoaderException {
        if (cellBaseConfiguration == null) {
            throw new LoaderException("A CellBase configuration is required to connect to MongoDB");
        }
        DatabaseCredentials mongodbCredentials = cellBaseConfiguration.getDatabases().getMongodb();

        // Hosts are given as a comma-separated list of 'host[:port]' entries; 27017 is the default port.
        String[] hosts = mongodbCredentials.getHost().split(",");
        List<DataStoreServerAddress> dataStoreServerAddressList = new ArrayList<>(hosts.length);
        for (String host : hosts) {
            String[] hostAndPort = host.trim().split(":");
            int port = 27017;
            if (hostAndPort.length == 2) {
                try {
                    port = Integer.parseInt(hostAndPort[1].trim());
                } catch (NumberFormatException e) {
                    throw new LoaderException("Invalid port in MongoDB host '" + host + "'");
                }
            }
            dataStoreServerAddressList.add(new DataStoreServerAddress(hostAndPort[0], port));
        }
        mongoDataStoreManager = new MongoDataStoreManager(dataStoreServerAddressList);

        String authenticationDatabase = mongodbCredentials.getOptions() == null
                ? null
                : mongodbCredentials.getOptions().get("authenticationDatabase");

        MongoDBConfiguration mongoDBConfiguration;
        if (authenticationDatabase != null && !authenticationDatabase.isEmpty()) {
            mongoDBConfiguration = MongoDBConfiguration.builder().add("username", mongodbCredentials.getUser())
                    .add("password", mongodbCredentials.getPassword())
                    .add("authenticationDatabase", authenticationDatabase)
                    .build();
            logger.debug("MongoDB 'authenticationDatabase' database parameter set to '{}'", authenticationDatabase);
        } else {
            mongoDBConfiguration = MongoDBConfiguration.builder().add("username", mongodbCredentials.getUser())
                    .add("password", mongodbCredentials.getPassword()).build();
        }
        // The password is deliberately not logged: credentials must not end up in log files.
        logger.debug("MongoDB credentials are user: '{}'", mongodbCredentials.getUser());

        mongoDataStore = mongoDataStoreManager.get(database, mongoDBConfiguration);

        String collectionName = getCollectionName(data);
        mongoDBCollection = mongoDataStore.getCollection(collectionName);
        logger.debug("Connection to MongoDB datastore '{}' created, collection '{}' is used",
                mongoDataStore.getDatabaseName(), collectionName);

        // Some collections need to add an extra _chunkIds field to speed up some queries
        getChunkSizes(collectionName);
        logger.debug("Chunk sizes '{}' used for collection '{}'", Arrays.toString(chunkSizes), collectionName);

        dbAdaptorFactory = new MongoDBAdaptorFactory(cellBaseConfiguration);
        dbAdaptor = getDBAdaptor(data);
    }

    /**
     * Returns the DB adaptor used to update documents of the given data type, or null when no
     * adaptor is used for that type. For clinical sources it also records the source in
     * 'clinicalVariantSource'.
     *
     * The species is taken from the second '_'-separated token of the database name. The default
     * assembly is selected - it is a bad idea to get the assembly from the database name since
     * '-', '_', '.' symbols are removed from the assembly before building the database name.
     * This method will soon be removed.
     *
     * @param data name of the data type being loaded
     * @return the adaptor for the data type, or null if none is used
     * @throws LoaderException if the data type is unknown or the species cannot be read from the database name
     */
    @Deprecated
    private CellBaseDBAdaptor getDBAdaptor(String data) throws LoaderException {
        String[] databaseParts = database.split("_");
        if (databaseParts.length < 2) {
            // Previously surfaced as an ArrayIndexOutOfBoundsException with no context.
            throw new LoaderException("Cannot get species from database name '" + database + "'");
        }
        String species = databaseParts[1];

        switch (data) {
        case "variation":
            // The adaptor used to be created and then immediately overwritten with null, which made
            // every variation update fail with a NullPointerException.
            return dbAdaptorFactory.getVariationDBAdaptor(species);
        case "clinvar":
            clinicalVariantSource = CLINVARVARIANTSOURCE;
            return dbAdaptorFactory.getClinicalDBAdaptor(species);
        case "gwas":
            clinicalVariantSource = GWASVARIANTSOURCE;
            return dbAdaptorFactory.getClinicalDBAdaptor(species);
        case "clinical":
            return dbAdaptorFactory.getClinicalDBAdaptor(species);
        case "cosmic":
            clinicalVariantSource = COSMICVARIANTSOURCE;
            return null;
        // No adaptor is used for the following data types.
        // TODO: implement an adaptor for protein_functional_prediction - current queries are issued from the
        // TODO: ProteinDBAdaptors, that's why there isn't one yet
        case "genome_info":
        case "genome_sequence":
        case "gene":
        case "cadd":
        case "regulatory_region":
        case "protein":
        case "protein_protein_interaction":
        case "protein_functional_prediction":
        case "conservation":
        case "metadata":
            return null;
        default:
            throw new LoaderException("Unknown data to load: '" + data + "'");
        }
    }

    // TODO: use adaptors within MongoDBCellBaseLoader, avoid using mongoDBCollection and remove this method
    /**
     * Maps a data type name to the MongoDB collection it is stored in. For the clinical sources
     * ("cosmic", "clinvar", "gwas") the source is also remembered in 'clinicalVariantSource'.
     *
     * @param data name of the data type being loaded
     * @return name of the collection holding that data type
     * @throws LoaderException if the data type is unknown
     */
    private String getCollectionName(String data) throws LoaderException {
        switch (data) {
        // Data types stored in a collection carrying the same name.
        case "genome_info":
        case "genome_sequence":
        case "gene":
        case "variation":
        case "regulatory_region":
        case "protein":
        case "protein_protein_interaction":
        case "protein_functional_prediction":
        case "conservation":
        case "clinical":
        case "metadata":
            return data;
        case "cadd":
            return "variation_functional_score";
        // Clinical sources share one collection; the source name equals the data name.
        case "cosmic":
        case "clinvar":
        case "gwas":
            clinicalVariantSource = data;
            return "clinical";
        default:
            throw new LoaderException("Unknown data to load: '" + data + "'");
        }
    }

    /**
     * Sets 'chunkSizes' for collections whose documents get a '_chunkIds' field. The field is
     * left untouched for a null or any other collection name.
     *
     * @param collectionName name of the collection being loaded; may be null
     */
    private void getChunkSizes(String collectionName) {
        if (collectionName == null) {
            return;
        }
        switch (collectionName) {
        case "genome_sequence":
            chunkSizes = new int[] {MongoDBCollectionConfiguration.GENOME_SEQUENCE_CHUNK_SIZE};
            break;
        case "gene":
            chunkSizes = new int[] {MongoDBCollectionConfiguration.GENE_CHUNK_SIZE};
            break;
        case "variation": { // TODO: why are we using different chunk sizes??
            int baseChunkSize = MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE;
            chunkSizes = new int[] {baseChunkSize, 10 * baseChunkSize};
            break;
        }
        case "variation_functional_score":
            chunkSizes = new int[] {MongoDBCollectionConfiguration.VARIATION_FUNCTIONAL_SCORE_CHUNK_SIZE};
            break;
        case "regulatory_region":
            chunkSizes = new int[] {MongoDBCollectionConfiguration.REGULATORY_REGION_CHUNK_SIZE};
            break;
        case "conservation":
            chunkSizes = new int[] {MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE};
            break;
        default:
            // Collection is not chunked.
            break;
        }
    }

    /**
     * Runs the loader: updates existing documents when 'field' is set, inserts new ones otherwise.
     *
     * @return number of records updated or inserted
     */
    @Override
    public Integer call() {
        return (field == null) ? prepareBatchAndLoad() : prepareBatchAndUpdate();
    }

    /**
     * Takes batches of JSON lines from the queue until the poison pill is received, parses them
     * and updates 'field' of the stored documents through the DB adaptor.
     *
     * A failing batch is logged and skipped so the queue keeps being drained. If the thread is
     * interrupted, the interrupt flag is restored and the loop stops.
     *
     * @return number of updated records, as reported by the adaptor
     */
    private int prepareBatchAndUpdate() {
        int numLoadedObjects = 0;
        boolean finished = false;
        while (!finished) {
            try {
                List<String> batch = blockingQueue.take();
                // Identity comparison is intended: the poison pill is a sentinel instance.
                if (batch == LoadRunner.POISON_PILL) {
                    finished = true;
                } else if (dbAdaptor == null) {
                    // The batch is still consumed so that producers are not blocked on a full queue.
                    logger.error("Error Loading batch: no DB adaptor available to update data '{}'", data);
                } else {
                    List<Document> dbObjectsBatch = new ArrayList<>(batch.size());
                    for (String jsonLine : batch) {
                        dbObjectsBatch.add(Document.parse(jsonLine));
                    }

                    Long numUpdates = (Long) dbAdaptor.update(dbObjectsBatch, field, innerFields).first();
                    if (numUpdates != null) {
                        numLoadedObjects += numUpdates;
                    }
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop: swallowing the interrupt would make the thread uncancellable.
                Thread.currentThread().interrupt();
                logger.error("Loader thread interrupted: " + e.getMessage(), e);
                finished = true;
            } catch (Exception e) {
                logger.error("Error Loading batch: " + e.getMessage(), e);
            }
        }
        logger.debug("'load' finished. " + numLoadedObjects + " records loaded");
        return numLoadedObjects;
    }

    /**
     * Takes batches of JSON lines from the queue until the poison pill is received, parses them,
     * adds the private '_chunkIds' and clinical fields, and inserts them into the collection.
     *
     * A failing batch is logged and skipped so the queue keeps being drained. If the thread is
     * interrupted, the interrupt flag is restored and the loop stops.
     *
     * @return number of inserted records
     */
    private int prepareBatchAndLoad() {
        int numLoadedObjects = 0;
        boolean finished = false;
        while (!finished) {
            try {
                List<String> batch = blockingQueue.take();
                // Identity comparison is intended: the poison pill is a sentinel instance.
                if (batch == LoadRunner.POISON_PILL) {
                    finished = true;
                } else {
                    List<Document> documentBatch = new ArrayList<>(batch.size());
                    for (String jsonLine : batch) {
                        Document document = Document.parse(jsonLine);
                        addChunkId(document);
                        addClinicalPrivateFields(document);
                        documentBatch.add(document);
                    }
                    numLoadedObjects += load(documentBatch);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop: swallowing the interrupt would make the thread uncancellable.
                Thread.currentThread().interrupt();
                logger.error("Loader thread interrupted: " + e.getMessage(), e);
                finished = true;
            } catch (Exception e) {
                // The exception is passed to the logger so the stack trace goes to the log, not stderr.
                logger.error("Error Loading batch: " + e.getMessage(), e);
            }
        }
        logger.debug("'load' finished. " + numLoadedObjects + " records loaded");
        return numLoadedObjects;
    }

    //    private void addVariationPrivateFields(Document document) {
    //        if (data.equals("variation")) {
    //            document.put("_id", buildId((String) document.get("chromosome"), (int) document.get("start"),
    //                    (String) document.get("reference"), (String) document.get("alternate")));
    //        }
    //    }

    /**
     * Adds the private '_geneIds' and '_phenotypes' fields to a clinical document. Which fields
     * of the document are read depends on 'clinicalVariantSource'; nothing is done when no
     * clinical source is being loaded, and a field is only added when a value was obtained.
     *
     * @param document document about to be inserted; modified in place
     */
    private void addClinicalPrivateFields(Document document) {
        if (clinicalVariantSource == null) {
            return;
        }

        List<String> genes;
        List<String> phenotypes;
        switch (clinicalVariantSource) {
        case CLINVARVARIANTSOURCE:
            genes = getClinvarGeneIds(document);
            phenotypes = getClinvarPhenotypes(document);
            break;
        case COSMICVARIANTSOURCE:
            if (document.get("geneName") != null) {
                genes = Collections.singletonList(document.getString("geneName"));
            } else {
                genes = null;
            }
            phenotypes = getCosmicPhenotypes(document);
            break;
        case GWASVARIANTSOURCE:
            if (document.get("reportedGenes") != null) {
                genes = Collections.singletonList(document.getString("reportedGenes"));
            } else {
                genes = null;
            }
            phenotypes = getGwasPhenotypes(document);
            break;
        default:
            genes = null;
            phenotypes = null;
            break;
        }

        if (genes != null) {
            document.put("_geneIds", genes);
        }
        if (phenotypes != null) {
            document.put("_phenotypes", phenotypes);
        }
    }

    /**
     * Collects the 'diseaseTrait' value of every trait of every study in a GWAS document.
     *
     * @param document GWAS document; its 'studies' list and each study's 'traits' list may be absent
     * @return the disease traits found, possibly empty; never null
     */
    private List<String> getGwasPhenotypes(Document document) {
        List<String> phenotypeList = new ArrayList<>();
        List<?> studies = document.get("studies", List.class);
        if (studies == null) {
            // A missing 'studies' list used to raise a NullPointerException, which made the caller
            // drop the whole batch; 'traits' was already guarded the same way.
            return phenotypeList;
        }
        for (Object studyObject : studies) {
            List<?> traits = ((Document) studyObject).get("traits", List.class);
            if (traits == null) {
                continue;
            }
            for (Object traitObject : traits) {
                Document trait = (Document) traitObject;
                if (trait.get("diseaseTrait") != null) {
                    phenotypeList.add(trait.getString("diseaseTrait"));
                }
            }
        }
        return phenotypeList;
    }

    /**
     * Collects the non-empty site and histology values of a COSMIC document, in the order
     * 'primarySite', 'histologySubtype', 'primaryHistology', 'siteSubtype'.
     *
     * @param document COSMIC document
     * @return the values found, possibly empty; never null
     */
    private List<String> getCosmicPhenotypes(Document document) {
        String[] phenotypeKeys = {"primarySite", "histologySubtype", "primaryHistology", "siteSubtype"};
        List<String> phenotypes = new ArrayList<>(4);
        for (String key : phenotypeKeys) {
            addIfNotEmpty((String) document.get(key), phenotypes);
        }
        return phenotypes;
    }

    /**
     * Appends 'element' to 'stringList' unless it is null or the empty string.
     *
     * @param element    candidate value; may be null
     * @param stringList list receiving the value
     */
    private void addIfNotEmpty(String element, List<String> stringList) {
        if (element == null || element.isEmpty()) {
            return;
        }
        stringList.add(element);
    }

    /**
     * Collects the trait names of a ClinVar document, read from
     * clinvarSet.referenceClinVarAssertion.traitSet.trait[].name[].elementValue.value.
     *
     * @param dbObject ClinVar document
     * @return the trait names found, or null when there are none
     */
    private List<String> getClinvarPhenotypes(Document dbObject) {
        Document clinvarSet = (Document) dbObject.get("clinvarSet");
        Document referenceAssertion = (Document) clinvarSet.get("referenceClinVarAssertion");
        Document traitSet = (Document) referenceAssertion.get("traitSet");
        List traits = traitSet.get("trait", List.class);

        List<String> phenotypes = new ArrayList<>();
        for (Object traitObject : traits) {
            List names = ((Document) traitObject).get("name", List.class);
            if (names == null) {
                continue;
            }
            for (Object nameObject : names) {
                Document elementValue = (Document) ((Document) nameObject).get("elementValue");
                if (elementValue == null) {
                    continue;
                }
                String phenotype = (String) elementValue.get("value");
                if (phenotype != null) {
                    phenotypes.add(phenotype);
                }
            }
        }
        // Callers treat null as "no phenotypes" and then skip the '_phenotypes' field.
        return phenotypes.isEmpty() ? null : phenotypes;
    }

    /**
     * Collects the gene symbols of a ClinVar document, read from
     * clinvarSet.referenceClinVarAssertion.measureSet.measure[].measureRelationship[].symbol[].elementValue.value.
     *
     * @param dbObject ClinVar document
     * @return the gene symbols found, or null when there are none
     */
    private List<String> getClinvarGeneIds(Document dbObject) {
        Document clinvarSet = (Document) dbObject.get("clinvarSet");
        Document referenceAssertion = (Document) clinvarSet.get("referenceClinVarAssertion");
        Document measureSet = (Document) referenceAssertion.get("measureSet");
        List measures = measureSet.get("measure", List.class);

        List<String> geneIds = new ArrayList<>();
        for (Object measureObject : measures) {
            List relationships = ((Document) measureObject).get("measureRelationship", List.class);
            if (relationships == null) {
                continue;
            }
            for (Object relationshipObject : relationships) {
                List symbols = ((Document) relationshipObject).get("symbol", List.class);
                if (symbols == null) {
                    continue;
                }
                for (Object symbolObject : symbols) {
                    Document elementValue = (Document) ((Document) symbolObject).get("elementValue");
                    if (elementValue == null) {
                        continue;
                    }
                    String geneId = (String) elementValue.get("value");
                    if (geneId != null) {
                        geneIds.add(geneId);
                    }
                }
            }
        }
        // Callers treat null as "no genes" and then skip the '_geneIds' field.
        return geneIds.isEmpty() ? null : geneIds;
    }

    /**
     * Creates the indexes of the given data type by running its index script. Index creation is
     * best-effort: failures are logged and not propagated. If the thread is interrupted while
     * waiting for the script, the interrupt flag is restored.
     *
     * @param data name of the data type whose indexes are created
     * @throws LoaderException declared by the overridden method and by the index path lookup
     */
    @Override
    public void createIndex(String data) throws LoaderException {
        Path indexFilePath = getIndexFilePath(data);
        if (indexFilePath == null) {
            logger.warn("No index found for '{}'", data);
            return;
        }

        logger.info("Creating indexes...");
        try {
            runCreateIndexProcess(indexFilePath);
        } catch (IOException e) {
            logger.error("Error creating indexes from '" + indexFilePath + "': " + e.getMessage(), e);
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while creating indexes from '" + indexFilePath + "': " + e.getMessage(), e);
        }
    }

    /**
     * Inserts a batch of documents into the collection opened by init().
     *
     * @param batch documents to insert
     * @return the inserted count reported by the bulk write result
     */
    public int load(List<Document> batch) {
        // TODO: queryOptions?
        QueryOptions insertOptions = new QueryOptions();
        QueryResult<BulkWriteResult> insertResult = mongoDBCollection.insert(batch, insertOptions);
        BulkWriteResult bulkWriteResult = insertResult.first();
        return bulkWriteResult.getInsertedCount();
    }

    /**
     * Adds the private '_chunkIds' field to a document when chunk sizes are defined for the
     * collection. For each chunk size, one id '<name>_<chunk index>_<size in thousands>k' is
     * produced for every chunk between the document's 'start' and 'end', where the name is the
     * 'chromosome' value if that key is present and the 'sequenceName' value otherwise.
     *
     * @param document document about to be inserted; modified in place
     */
    private void addChunkId(Document document) {
        if (chunkSizes == null || chunkSizes.length == 0) {
            return;
        }

        List<String> chunkIds = new ArrayList<>();
        for (int chunkSize : chunkSizes) {
            int firstChunk = (Integer) document.get("start") / chunkSize;
            int lastChunk = (Integer) document.get("end") / chunkSize;
            String suffix = "_" + (chunkSize / 1000) + "k";
            for (int chunk = firstChunk; chunk <= lastChunk; chunk++) {
                Object sequence = document.containsKey("chromosome")
                        ? document.get("chromosome")
                        : document.get("sequenceName");
                chunkIds.add(sequence + "_" + chunk + suffix);
            }
        }
        logger.debug("Setting chunkIds to {}", chunkIds.toString());
        document.put("_chunkIds", chunkIds);
    }

    /**
     * Closes the connection to the database opened by init(). Does nothing when no connection
     * manager was created, e.g. when init() was never called or failed early.
     */
    @Override
    public void close() {
        if (mongoDataStoreManager != null) {
            mongoDataStoreManager.close(database);
        }
    }

    /**
     * Resolves the index script of a data type inside the configured index script folder.
     * Both data names and collection names are accepted where they differ: "cadd" and
     * "variation_functional_score" share one script, as do "cosmic", "clinvar", "gwas" and "clinical".
     *
     * @param data name of the data type (or collection) whose indexes are to be created
     * @return path of the index script, or null when no folder is configured, 'data' is null,
     *         or no script exists for that data type
     * @throws LoaderException never thrown by this implementation; kept for signature compatibility
     */
    private Path getIndexFilePath(String data) throws LoaderException {
        if (indexScriptFolder == null || data == null) {
            logger.error("No path can be provided for index, check index folder '{}' and data '{}'",
                    indexScriptFolder, data);
            return null;
        }

        String indexFileName;
        switch (data) {
        // Data types whose script is named after the data type itself.
        case "genome_sequence":
        case "gene":
        case "variation":
        case "regulatory_region":
        case "protein":
        case "protein_protein_interaction":
        case "protein_functional_prediction":
        case "conservation":
            indexFileName = data + "-indexes.js";
            break;
        // "cadd" data is loaded into the variation_functional_score collection; it had no case
        // here, so its indexes were silently never created.
        case "cadd":
        case "variation_functional_score":
            indexFileName = "variation_functional_score-indexes.js";
            break;
        case "cosmic":
        case "clinvar":
        case "gwas":
        case "clinical":
            indexFileName = "clinical-indexes.js";
            break;
        default:
            // "genome_info" and any other data type have no index script.
            indexFileName = null;
            break;
        }
        if (indexFileName == null) {
            return null;
        }
        return indexScriptFolder.resolve(indexFileName);
    }

    /**
     * Runs the 'mongo' shell on the given index script against the target database and waits
     * for it to finish. The output of the shell is forwarded to this process' standard output.
     *
     * @param indexFilePath path of the JavaScript file that creates the indexes
     * @return true if the shell exited with code 0, false otherwise
     * @throws IOException          if the 'mongo' process cannot be started
     * @throws InterruptedException if the thread is interrupted while waiting; the child process is destroyed first
     */
    protected boolean runCreateIndexProcess(Path indexFilePath) throws IOException, InterruptedException {
        DatabaseCredentials mongodbCredentials = cellBaseConfiguration.getDatabases().getMongodb();

        List<String> args = new ArrayList<>();
        args.add("mongo");
        args.add("--host");
        args.add(mongodbCredentials.getHost());

        // Position of the password within 'args', remembered so it can be masked when logging.
        int passwordIndex = -1;
        String user = mongodbCredentials.getUser();
        if (user != null && !user.isEmpty()) {
            args.add("-u");
            args.add(user);
            args.add("-p");
            passwordIndex = args.size();
            args.add(mongodbCredentials.getPassword());
        }

        // Same rule as in init(): an empty value is treated as "not set".
        String authenticationDatabase = mongodbCredentials.getOptions() == null
                ? null
                : mongodbCredentials.getOptions().get("authenticationDatabase");
        if (authenticationDatabase != null && !authenticationDatabase.isEmpty()) {
            args.add("--authenticationDatabase");
            args.add(authenticationDatabase);
            logger.debug("MongoDB 'authenticationDatabase' database parameter set to '{}'", authenticationDatabase);
        }
        args.add(database);
        args.add(indexFilePath.toString());

        // Log a copy of the command with the password masked: credentials must not reach the logs.
        List<String> loggedCommand = new ArrayList<>(args);
        if (passwordIndex >= 0) {
            loggedCommand.set(passwordIndex, "********");
        }
        logger.debug("Executing command: '{}'", StringUtils.join(loggedCommand, " "));

        ProcessBuilder processBuilder = new ProcessBuilder(args);
        // The child's output must go somewhere: with the default pipes nobody reads it, and the
        // shell blocks once the pipe buffer fills up, so waitFor() would never return.
        processBuilder.redirectErrorStream(true);
        processBuilder.redirectOutput(ProcessBuilder.Redirect.INHERIT);

        Process process = processBuilder.start();
        int exitValue;
        try {
            exitValue = process.waitFor();
        } catch (InterruptedException e) {
            // Do not leave an orphan shell running when the caller is cancelled.
            process.destroy();
            throw e;
        }

        if (exitValue != 0) {
            logger.warn("Error executing {}, error code: {}", indexFilePath, exitValue);
            return false;
        }
        return true;
    }

}