org.opencb.cellbase.lib.impl.VariantMongoDBAdaptor.java Source code

Introduction

Here is the source code for org.opencb.cellbase.lib.impl.VariantMongoDBAdaptor.java
Source

/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.impl;

import com.mongodb.BulkWriteException;
import com.mongodb.MongoClient;
import com.mongodb.QueryBuilder;
import com.mongodb.bulk.BulkWriteResult;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.Projections;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.Score;
import org.opencb.cellbase.core.api.VariantDBAdaptor;
import org.opencb.cellbase.lib.MongoDBCollectionConfiguration;
import org.opencb.cellbase.lib.VariantMongoIterator;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.datastore.mongodb.MongoDataStore;

import java.util.*;
import java.util.function.Consumer;
import java.util.regex.Pattern;

/**
 * Created by imedina on 26/11/15.
 */
public class VariantMongoDBAdaptor extends MongoDBAdaptor implements VariantDBAdaptor<Variant> {

    // Dotted path of the population frequencies list inside a variant document; also one of the field selectors
    // accepted by update().
    private static final String POP_FREQUENCIES_FIELD = "annotation.populationFrequencies";
    // Key of the annotation sub-document inside a variant document; also one of the field selectors accepted by
    // update().
    private static final String ANNOTATION_FIELD = "annotation";
    // Divisor applied to each 16-bit value unpacked from the functional score documents to obtain the reported score.
    private static final float DECIMAL_RESOLUTION = 100f;
    // Literal prefixes (compared with String.startsWith, not evaluated as regular expressions) used to decide which
    // consequence-type field a gene query targets.
    private static final String ENSEMBL_GENE_ID_PATTERN = "ENSG00";
    private static final String ENSEMBL_TRANSCRIPT_ID_PATTERN = "ENST00";

    // Handle on the "variation_functional_score" collection; queried by getFunctionalScoreVariant().
    private MongoDBCollection caddDBCollection;

    /**
     * Creates an adaptor bound to the "variation" collection and to the "variation_functional_score" collection of
     * the given data store.
     *
     * @param species        species forwarded to the parent adaptor
     * @param assembly       assembly forwarded to the parent adaptor
     * @param mongoDataStore data store from which both collections are obtained
     */
    public VariantMongoDBAdaptor(String species, String assembly, MongoDataStore mongoDataStore) {
        super(species, assembly, mongoDataStore);

        // Main variant collection, used by every query/update method of this adaptor.
        this.mongoDBCollection = mongoDataStore.getCollection("variation");
        // Functional scores live in their own collection.
        this.caddDBCollection = mongoDataStore.getCollection("variation_functional_score");

        logger.debug("VariationMongoDBAdaptor: in 'constructor'");
    }

    /**
     * Finds variants having an entry in their "ids" field that begins with the given text.
     *
     * <p>The text is matched literally: it is quoted before being embedded in the anchored regular expression, so
     * characters such as {@code .}, {@code (} or {@code *} in the id neither alter the match nor make the pattern
     * invalid.
     *
     * @param id      literal prefix to look for; must not be {@code null}
     * @param options query options forwarded to the collection
     * @return result projected onto the "ids", "chromosome", "start" and "end" fields
     * @throws NullPointerException if {@code id} is {@code null}
     */
    @Override
    public QueryResult startsWith(String id, QueryOptions options) {
        Objects.requireNonNull(id, "id");
        // Pattern.quote wraps the prefix in \Q...\E so that it is treated as a literal rather than as regex syntax.
        Bson regex = Filters.regex("ids", Pattern.compile("^" + Pattern.quote(id)));
        Bson include = Projections.include("ids", "chromosome", "start", "end");
        return mongoDBCollection.find(regex, include, options);
    }

    /**
     * Not implemented by this adaptor: both arguments are ignored.
     *
     * @param query   unused
     * @param options unused
     * @return always {@code null}
     */
    @Override
    public QueryResult<Variant> next(Query query, QueryOptions options) {
        return null;
    }

    /**
     * Not implemented by this adaptor: both arguments are ignored.
     *
     * @param query   unused
     * @param options unused
     * @return always {@code null}
     */
    @Override
    public QueryResult nativeNext(Query query, QueryOptions options) {
        return null;
    }

    /**
     * Computes interval frequencies for the variants matching the query within the queried region.
     *
     * <p>The region is read from the {@code REGION} query parameter. When that parameter is missing or empty no
     * query is run; emptiness is checked in addition to {@code null}, in the same way
     * {@code createImprecisePositionQuery} guards its own string parameters, so that an empty value is never handed
     * to the region parser.
     *
     * @param query        query holding the region and any additional variant filters
     * @param intervalSize interval size forwarded to the underlying implementation
     * @param options      query options forwarded to the underlying implementation
     * @return the interval frequencies, or {@code null} when no region was provided
     */
    @Override
    public QueryResult getIntervalFrequencies(Query query, int intervalSize, QueryOptions options) {
        String regionString = query.getString(QueryParams.REGION.key());
        if (regionString == null || regionString.isEmpty()) {
            return null;
        }
        Region region = Region.parseRegion(regionString);
        Bson bsonDocument = parseQuery(query);
        return getIntervalFrequencies(bsonDocument, region, intervalSize, options);
    }

    /**
     * Dispatches a bulk update of variant documents according to the field being updated.
     *
     * @param objectList  documents carrying the new values; cast to {@code List<Document>} before delegation
     * @param field       either "annotation.populationFrequencies" or "annotation"
     * @param innerFields annotation sub-fields to overwrite; only used for the "annotation" field
     * @return whatever the delegated update returns, or {@code null} when {@code field} is not one of the two
     *         supported values (an error is logged in that case)
     */
    @Override
    public QueryResult<Long> update(List objectList, String field, String[] innerFields) {
        switch (field) {
        case POP_FREQUENCIES_FIELD:
            return updatePopulationFrequencies((List<Document>) objectList);
        case ANNOTATION_FIELD:
            return updateAnnotation((List<Document>) objectList, innerFields);
        default:
            logger.error("Invalid field {}: no action implemented for updating this field.", field);
            return null;
        }
    }

    /**
     * Counts the variants matching the query.
     *
     * @param query variant filters, translated by {@code parseQuery}
     * @return count result returned by the collection
     */
    @Override
    public QueryResult<Long> count(Query query) {
        return mongoDBCollection.count(parseQuery(query));
    }

    /**
     * Returns the distinct values of a field among the variants matching the query.
     *
     * @param query variant filters, translated by {@code parseQuery}
     * @param field field whose distinct values are requested
     * @return distinct result returned by the collection
     */
    @Override
    public QueryResult distinct(Query query, String field) {
        return mongoDBCollection.distinct(field, parseQuery(query));
    }

    /**
     * Not implemented by this adaptor: the argument is ignored.
     *
     * @param query unused
     * @return always {@code null}
     */
    @Override
    public QueryResult stats(Query query) {
        return null;
    }

    /**
     * Fetches the variants matching the query, converted to {@link Variant} objects.
     *
     * <p>The private exclusions of {@code addVariantPrivateExcludeOptions} are always applied, which means a
     * non-null {@code options} instance is modified in place.
     *
     * @param query   variant filters, translated by {@code parseQuery}
     * @param options query options; may be {@code null}
     * @return matching variants
     */
    @Override
    public QueryResult<Variant> get(Query query, QueryOptions options) {
        Bson filter = parseQuery(query);

        // FIXME: patch to exclude annotation.additionalAttributes from the results - restore the call to the common
        // FIXME: addPrivateExcludeOptions as soon as the variation collection is updated with the new form of the
        // FIXME: additionalAttributes field
        QueryOptions effectiveOptions = addVariantPrivateExcludeOptions(options);

        String filterJson =
                filter.toBsonDocument(Document.class, MongoClient.getDefaultCodecRegistry()).toJson();
        logger.debug("query: {}", filterJson);

        return mongoDBCollection.find(filter, null, Variant.class, effectiveOptions);
    }

    // FIXME: patch to exclude annotation.additionalAttributes from the results - to remove as soon as the variation
    // FIXME: collection is updated with the new form of the additionalAttributes field
    /**
     * Ensures "_id", "_chunkIds" and "annotation.additionalAttributes" are part of the "exclude" option.
     *
     * @param options options to complete; modified in place when not {@code null}
     * @return the same instance with the exclusions appended to (or set as) its "exclude" value, or a new
     *         {@link QueryOptions} holding only the exclusions when {@code options} is {@code null}
     */
    protected QueryOptions addVariantPrivateExcludeOptions(QueryOptions options) {
        final String privateFields = "_id,_chunkIds,annotation.additionalAttributes";

        if (options == null) {
            return new QueryOptions("exclude", privateFields);
        }

        if (options.get("exclude") == null) {
            options.put("exclude", privateFields);
        } else {
            // Keep whatever the caller already excluded and append the private fields to it.
            options.put("exclude", options.getString("exclude") + "," + privateFields);
        }
        return options;
    }

    /**
     * Fetches the variants matching the query without converting them to {@link Variant} and without adding any
     * exclusion to the given options.
     *
     * @param query   variant filters, translated by {@code parseQuery}
     * @param options query options forwarded untouched to the collection
     * @return matching documents as returned by the collection
     */
    @Override
    public QueryResult nativeGet(Query query, QueryOptions options) {
        Bson filter = parseQuery(query);

        String filterJson =
                filter.toBsonDocument(Document.class, MongoClient.getDefaultCodecRegistry()).toJson();
        logger.debug("query: {}", filterJson);

        return mongoDBCollection.find(filter, options);
    }

    /**
     * Iterates over the variants matching the query, wrapping the native cursor in a {@link VariantMongoIterator}.
     *
     * @param query   variant filters, translated by {@code parseQuery}
     * @param options query options, completed by the inherited {@code addPrivateExcludeOptions}
     * @return iterator over the matching variants
     */
    @Override
    public Iterator<Variant> iterator(Query query, QueryOptions options) {
        Bson filter = parseQuery(query);
        // NOTE(review): get() uses addVariantPrivateExcludeOptions (which also drops
        // annotation.additionalAttributes) while this method uses the common helper - confirm this is intended.
        QueryOptions effectiveOptions = addPrivateExcludeOptions(options);
        return new VariantMongoIterator(
                mongoDBCollection.nativeQuery().find(filter, effectiveOptions).iterator());
    }

    /**
     * Iterates over the raw documents matching the query.
     *
     * @param query   variant filters, translated by {@code parseQuery}
     * @param options query options forwarded untouched to the native query
     * @return iterator obtained from the native find result
     */
    @Override
    public Iterator nativeIterator(Query query, QueryOptions options) {
        return mongoDBCollection.nativeQuery().find(parseQuery(query), options).iterator();
    }

    /**
     * Applies an action to every raw document matching the query, in the order delivered by
     * {@code nativeIterator}.
     *
     * @param query   variant filters
     * @param action  callback invoked once per document; must not be {@code null}
     * @param options query options forwarded to {@code nativeIterator}
     * @throws NullPointerException if {@code action} is {@code null} (checked before the query is issued)
     */
    @Override
    public void forEach(Query query, Consumer<? super Object> action, QueryOptions options) {
        Objects.requireNonNull(action);
        for (Iterator cursor = nativeIterator(query, options); cursor.hasNext();) {
            action.accept(cursor.next());
        }
    }

    /**
     * Not implemented by this adaptor: all arguments are ignored.
     *
     * @param query      unused
     * @param field      unused
     * @param numResults unused
     * @param asc        unused
     * @return always {@code null}
     */
    @Override
    public QueryResult rank(Query query, String field, int numResults, boolean asc) {
        return null;
    }

    /**
     * Groups the variants matching the query by a single field, delegating to the inherited {@code groupBy} with
     * "name" as its third argument.
     *
     * @param query   variant filters, translated by {@code parseQuery}
     * @param field   field to group by
     * @param options query options forwarded to the inherited implementation
     * @return grouping result
     */
    @Override
    public QueryResult groupBy(Query query, String field, QueryOptions options) {
        return groupBy(parseQuery(query), field, "name", options);
    }

    /**
     * Groups the variants matching the query by several fields, delegating to the inherited {@code groupBy} with
     * "name" as its third argument.
     *
     * @param query   variant filters, translated by {@code parseQuery}
     * @param fields  fields to group by
     * @param options query options forwarded to the inherited implementation
     * @return grouping result
     */
    @Override
    public QueryResult groupBy(Query query, List<String> fields, QueryOptions options) {
        return groupBy(parseQuery(query), fields, "name", options);
    }

    /**
     * Translates a {@link Query} into a Mongo filter.
     *
     * <p>Each supported parameter (region, id, chromosome, imprecise start/end intervals, start, reference,
     * alternate, consequence type and gene) contributes at most one or two conditions; all conditions are combined
     * with a logical AND.
     *
     * @param query query to translate
     * @return the AND of every generated condition, or an empty {@link Document} when none was generated
     */
    private Bson parseQuery(Query query) {
        List<Bson> conditions = new ArrayList<>();

        // Positional and identifier filters.
        createRegionQuery(query, QueryParams.REGION.key(), MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE,
                conditions);
        createOrQuery(query, QueryParams.ID.key(), "id", conditions);
        createOrQuery(query, QueryParams.CHROMOSOME.key(), "chromosome", conditions);

        // Structural variant confidence intervals around start and end.
        createImprecisePositionQuery(query, QueryParams.CI_START_LEFT.key(), QueryParams.CI_START_RIGHT.key(),
                "sv.ciStartLeft", "sv.ciStartRight", conditions);
        createImprecisePositionQuery(query, QueryParams.CI_END_LEFT.key(), QueryParams.CI_END_RIGHT.key(),
                "sv.ciEndLeft", "sv.ciEndRight", conditions);

        createOrQuery(query, QueryParams.START.key(), "start", conditions, QueryValueType.INTEGER);

        // Reference and alternate are read explicitly as string lists and passed to the list-based overload.
        if (query.containsKey(QueryParams.REFERENCE.key())) {
            createOrQuery(query.getAsStringList(QueryParams.REFERENCE.key()), "reference", conditions);
        }
        if (query.containsKey(QueryParams.ALTERNATE.key())) {
            createOrQuery(query.getAsStringList(QueryParams.ALTERNATE.key()), "alternate", conditions);
        }

        // Annotation based filters.
        createOrQuery(query, QueryParams.CONSEQUENCE_TYPE.key(),
                "annotation.consequenceTypes.sequenceOntologyTerms.name", conditions);
        createGeneOrQuery(query, QueryParams.GENE.key(), conditions);

        return conditions.isEmpty() ? new Document() : Filters.and(conditions);
    }

    /**
     * Adds the two conditions selecting documents whose stored interval overlaps the queried interval.
     *
     * <p>Nothing is added unless both query parameters are present and non-empty. Otherwise the stored left limit
     * must be lower than or equal to the queried right value, and the stored right limit must be greater than or
     * equal to the queried left value.
     *
     * @param query                query to read the interval from; may be {@code null}
     * @param leftQueryParam       name of the query parameter holding the left value
     * @param rightQueryParam      name of the query parameter holding the right value
     * @param leftLimitMongoField  document field holding the stored left limit
     * @param rightLimitMongoField document field holding the stored right limit
     * @param andBsonList          list the generated conditions are appended to
     */
    private void createImprecisePositionQuery(Query query, String leftQueryParam, String rightQueryParam,
            String leftLimitMongoField, String rightLimitMongoField, List<Bson> andBsonList) {
        if (query == null) {
            return;
        }
        String leftText = query.getString(leftQueryParam);
        if (leftText == null || leftText.isEmpty()) {
            return;
        }
        String rightText = query.getString(rightQueryParam);
        if (rightText == null || rightText.isEmpty()) {
            return;
        }

        int queriedLeft = query.getInt(leftQueryParam);
        int queriedRight = query.getInt(rightQueryParam);
        andBsonList.add(Filters.lte(leftLimitMongoField, queriedRight));
        andBsonList.add(Filters.gte(rightLimitMongoField, queriedLeft));
    }

    //    private Bson getPositionWithinIntervalQuery(int value, String leftLimitMongoField,
    //                                                String righLimitMongoField) {
    //        List<Bson> andBsonList = new ArrayList<>(2);
    //        andBsonList.add(Filters.lte(leftLimitMongoField, value));
    //        andBsonList.add(Filters.gte(righLimitMongoField, value));
    //
    //        return Filters.and(andBsonList);
    //    }
    /**
     * Adds a gene condition built from the values of a query parameter.
     *
     * <p>A single value yields the condition of {@code getGeneQuery} directly; several values yield the OR of their
     * individual conditions. Nothing is added when the query is {@code null} or the parameter has no values.
     *
     * @param query       query to read the gene identifiers from; may be {@code null}
     * @param queryParam  name of the query parameter holding the gene identifiers
     * @param andBsonList list the generated condition is appended to
     */
    private void createGeneOrQuery(Query query, String queryParam, List<Bson> andBsonList) {
        if (query == null) {
            return;
        }
        List<String> genes = query.getAsStringList(queryParam);
        if (genes == null || genes.isEmpty()) {
            return;
        }

        if (genes.size() == 1) {
            andBsonList.add(getGeneQuery(genes.get(0)));
            return;
        }

        List<Bson> alternatives = new ArrayList<>(genes.size());
        for (String gene : genes) {
            alternatives.add(getGeneQuery(gene));
        }
        andBsonList.add(Filters.or(alternatives));
    }

    /**
     * Builds the equality condition for one gene identifier, choosing the target field from the identifier prefix.
     *
     * <p>Identifiers starting with "ENSG00" are matched against the Ensembl gene id, those starting with "ENST00"
     * against the Ensembl transcript id, and anything else against the gene name.
     *
     * @param geneId gene name, Ensembl gene id or Ensembl transcript id
     * @return equality condition on the selected consequence-type field
     */
    private Bson getGeneQuery(String geneId) {
        // For some reason Mongo does not deal properly with OR queries and indexes: OR-ing the three possible
        // fields is extremely slow, whereas querying a single field chosen beforehand provides instant results.
        final String targetField;
        if (geneId.startsWith(ENSEMBL_GENE_ID_PATTERN)) {
            targetField = "annotation.consequenceTypes.ensemblGeneId";
        } else if (geneId.startsWith(ENSEMBL_TRANSCRIPT_ID_PATTERN)) {
            targetField = "annotation.consequenceTypes.ensemblTranscriptId";
        } else {
            targetField = "annotation.consequenceTypes.geneName";
        }
        return Filters.eq(targetField, geneId);
    }

    /**
     * Overwrites the annotation (or selected annotation sub-fields) of already stored variants.
     *
     * <p>One update is issued per input document, matching on chunk id, chromosome, start, reference and alternate.
     * The bulk update runs with upsert and multi disabled, so variants that are not stored yet are not created.
     *
     * @param variantDocumentList documents providing the matching keys and the new "annotation" object
     * @param innerFields         names of the annotation sub-fields to overwrite; when {@code null} or empty the
     *                            whole annotation object is replaced
     * @return result whose single value is the number of upserted plus modified documents, or {@code null} when the
     *         input list is empty
     */
    private QueryResult<Long> updateAnnotation(List<Document> variantDocumentList, String[] innerFields) {
        List<Bson> queries = new ArrayList<>(variantDocumentList.size());
        List<Bson> updates = new ArrayList<>(variantDocumentList.size());

        // Short-circuit && is required here: with a null array the length must not be evaluated.
        boolean overwriteSelectedFields = innerFields != null && innerFields.length > 0;

        for (Document variantDBObject : variantDocumentList) {
            Document annotationDBObject = (Document) variantDBObject.get(ANNOTATION_FIELD);
            Document toOverwrite = new Document();
            if (overwriteSelectedFields) {
                // Only the requested sub-fields that are actually present in the new annotation are overwritten.
                for (String field : innerFields) {
                    if (annotationDBObject.get(field) != null) {
                        toOverwrite.put(ANNOTATION_FIELD + "." + field, annotationDBObject.get(field));
                    }
                }
            } else {
                toOverwrite.put(ANNOTATION_FIELD, annotationDBObject);
            }

            Document update = new Document().append("$set", toOverwrite);
            updates.add(update);

            String chunkId = getChunkIdPrefix((String) variantDBObject.get("chromosome"),
                    (int) variantDBObject.get("start"), MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE);
            queries.add(new Document("_chunkIds", chunkId).append("chromosome", variantDBObject.get("chromosome"))
                    .append("start", variantDBObject.get("start"))
                    .append("reference", variantDBObject.get("reference"))
                    .append("alternate", variantDBObject.get("alternate")));
        }

        if (queries.isEmpty()) {
            logger.info("no object updated");
            return null;
        }

        logger.info("updating object");
        QueryOptions options = new QueryOptions("upsert", false);
        options.put("multi", false);
        // Any exception raised by the bulk write simply propagates to the caller.
        QueryResult<BulkWriteResult> bulkWriteResult = mongoDBCollection.update(queries, updates, options);
        logger.info("{} object updated", bulkWriteResult.first().getModifiedCount());

        return new QueryResult<>(bulkWriteResult.getId(),
                bulkWriteResult.getDbTime(), bulkWriteResult.getNumResults(),
                bulkWriteResult.getNumTotalResults(), bulkWriteResult.getWarningMsg(),
                bulkWriteResult.getErrorMsg(),
                Collections.singletonList((long) (bulkWriteResult.first().getUpserts().size()
                        + bulkWriteResult.first().getModifiedCount())));
    }

    /**
     * Appends population frequencies to stored variants, creating the variant when it does not exist yet.
     *
     * <p>One update is issued per input document, matching on chromosome, start, reference and alternate. The bulk
     * update runs with upsert enabled and multi disabled. Each input document is modified in place: its
     * "annotation" entry is removed and a "_chunkIds" entry is added before it is used as the insert payload.
     *
     * @param variantDocumentList documents providing the matching keys and an "annotation" object holding the
     *                            "populationFrequencies" to append
     * @return result whose single value is the number of upserted plus modified documents, or {@code null} when the
     *         input list is empty
     */
    private QueryResult<Long> updatePopulationFrequencies(List<Document> variantDocumentList) {

        List<Bson> queries = new ArrayList<>(variantDocumentList.size());
        List<Bson> updates = new ArrayList<>(variantDocumentList.size());

        for (Document variantDBObject : variantDocumentList) {
            Document annotationDBObject = (Document) variantDBObject.get(ANNOTATION_FIELD);
            // $push combined with $each appends every element of the list; it replaces the $pushAll operator,
            // which MongoDB deprecated and later removed.
            Document push = new Document(POP_FREQUENCIES_FIELD,
                    new Document("$each", annotationDBObject.get("populationFrequencies")));

            // Remove annotation object from the DBObject so that push and setOnInsert do not update the same fields:
            // i.e. annotation.populationFrequencies and annotation
            variantDBObject.remove(ANNOTATION_FIELD);
            addChunkId(variantDBObject);

            Document update = new Document().append("$push", push).append("$setOnInsert", variantDBObject);

            updates.add(update);

            queries.add(new Document("chromosome", variantDBObject.get("chromosome"))
                    .append("start", variantDBObject.get("start"))
                    .append("reference", variantDBObject.get("reference"))
                    .append("alternate", variantDBObject.get("alternate")));
        }

        if (queries.isEmpty()) {
            logger.info("no object updated");
            return null;
        }

        logger.info("updating object");
        QueryOptions options = new QueryOptions("upsert", true);
        options.put("multi", false);
        // Any exception raised by the bulk write simply propagates to the caller.
        QueryResult<BulkWriteResult> bulkWriteResult = mongoDBCollection.update(queries, updates, options);
        logger.info("{} object updated",
                bulkWriteResult.first().getUpserts().size() + bulkWriteResult.first().getModifiedCount());

        return new QueryResult<>(bulkWriteResult.getId(),
                bulkWriteResult.getDbTime(), bulkWriteResult.getNumResults(),
                bulkWriteResult.getNumTotalResults(), bulkWriteResult.getWarningMsg(),
                bulkWriteResult.getErrorMsg(),
                Collections.singletonList((long) (bulkWriteResult.first().getUpserts().size()
                        + bulkWriteResult.first().getModifiedCount())));
    }

    // Method copied from MongoDBCellbaseLoader. In a near future only this one will stay. Insert work currently done
    // by MongoDBCellbaseLoader must be replaced by an appropriate method in this adaptor
    /**
     * Stores under "_chunkIds" the identifiers of every chunk spanned by the document.
     *
     * <p>Chunk indexes run from {@code start / chunkSize} to {@code end / chunkSize}, both included. Each identifier
     * has the form {@code <sequence>_<chunkIndex>_<chunkSize / 1000>k}, where the sequence is the "chromosome"
     * value when that key exists and the "sequenceName" value otherwise.
     *
     * @param dbObject document holding integer "start" and "end" values; receives the "_chunkIds" list
     */
    private void addChunkId(Document dbObject) {
        int chunkSize = MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE;
        int firstChunk = (Integer) dbObject.get("start") / chunkSize;
        int lastChunk = (Integer) dbObject.get("end") / chunkSize;

        // Both the sequence and the suffix are the same for every chunk of the document.
        Object sequence = dbObject.containsKey("chromosome")
                ? dbObject.get("chromosome")
                : dbObject.get("sequenceName");
        String suffix = "_" + chunkSize / 1000 + "k";

        List<String> chunkIds = new ArrayList<>();
        for (int chunk = firstChunk; chunk <= lastChunk; chunk++) {
            chunkIds.add(sequence + "_" + chunk + suffix);
        }
        dbObject.put("_chunkIds", chunkIds);
    }

    /**
     * Retrieves the functional scores stored for a variant.
     *
     * <p>Documents of the functional score collection are selected by chunk id. For each document whose
     * [start, end] range contains the variant position, the value at index {@code position - start} of its
     * "values" list is parsed as a long and the 16-bit slot matching the alternate allele is decoded. One
     * {@link Score} is produced per document whose "source" is "cadd_raw" or "cadd_scaled" (case-insensitive).
     *
     * @param variant      variant providing chromosome, start, reference and alternate
     * @param queryOptions query options forwarded to the query execution
     * @return the query result with its result list replaced by the decoded scores
     */
    @Override
    public QueryResult<Score> getFunctionalScoreVariant(Variant variant, QueryOptions queryOptions) {
        String chromosome = variant.getChromosome();
        int position = variant.getStart();
        String reference = variant.getReference();
        String alternate = variant.getAlternate();

        String chunkId = getChunkIdPrefix(chromosome, position,
                MongoDBCollectionConfiguration.VARIATION_FUNCTIONAL_SCORE_CHUNK_SIZE);
        QueryBuilder builder = QueryBuilder.start("_chunkIds").is(chunkId);
        QueryResult result = executeQuery(chromosome + "_" + position + "_" + reference + "_" + alternate,
                new Document(builder.get().toMap()), queryOptions, caddDBCollection);

        List<Score> scores = new ArrayList<>();
        for (Object object : result.getResult()) {
            Document dbObject = (Document) object;
            int chunkStart = dbObject.getInteger("start");
            int chunkEnd = dbObject.getInteger("end");
            // CADD positions are not continuous through the whole chromosome. Several documents may be associated with
            // the same chunk id: we have to be sure that current document contains queried position. Only two documents
            // will contain queried position - one for raw and one for scaled values
            if (position < chunkStart || position > chunkEnd) {
                continue;
            }

            ArrayList packedValues = dbObject.get("values", ArrayList.class);
            long packed = Long.parseLong(packedValues.get(position - chunkStart).toString());

            String source = dbObject.getString("source");
            boolean rawScore = source.equalsIgnoreCase("cadd_raw");
            if (rawScore || source.equalsIgnoreCase("cadd_scaled")) {
                scores.add(Score.newBuilder()
                        .setScore(decodePackedScore(packed, alternate, rawScore))
                        .setSource(source)
                        .setDescription(null)
                        .build());
            }
        }

        result.setResult(scores);
        return result;
    }

    /**
     * Decodes the score of one alternate allele from a packed long.
     *
     * <p>The long holds four 16-bit slots, read from the most to the least significant one for alleles a, c, g and
     * t (case-insensitive). The selected slot is divided by {@code DECIMAL_RESOLUTION}; raw scores are additionally
     * shifted down by 10. Any other allele yields 0 for both kinds of score.
     *
     * @param packed    packed value read from the "values" list
     * @param alternate alternate allele selecting the slot
     * @param rawScore  {@code true} to decode a raw score, {@code false} to decode a scaled one
     * @return decoded score, or 0 when the allele is not one of a, c, g, t
     */
    private static float decodePackedScore(long packed, String alternate, boolean rawScore) {
        final int shift;
        switch (alternate.toLowerCase()) {
        case "a":
            shift = 48;
            break;
        case "c":
            shift = 32;
            break;
        case "g":
            shift = 16;
            break;
        case "t":
            shift = 0;
            break;
        default:
            return 0f;
        }
        float scaled = ((short) (packed >> shift)) / DECIMAL_RESOLUTION;
        return rawScore ? scaled - 10 : scaled;
    }
}