org.opencb.opencga.app.cli.analysis.executors.VariantQueryCommandUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.opencb.opencga.app.cli.analysis.executors.VariantQueryCommandUtils.java

Source

/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.app.cli.analysis.executors;

import com.beust.jcommander.ParameterException;
import org.apache.commons.lang3.StringUtils;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.utils.FileUtils;
import org.opencb.opencga.app.cli.analysis.options.VariantCommandOptions;
import org.opencb.opencga.storage.core.manager.variant.VariantCatalogQueryUtils;
import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;
import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;

import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.*;
import static org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat.VCF;

/**
 * Static helpers that translate the variant-query options of the analysis command line into
 * {@link Query} and {@link QueryOptions} objects.
 *
 * Created by imedina on 30/12/15.
 */
public class VariantQueryCommandUtils
        extends org.opencb.opencga.storage.app.cli.client.executors.VariantQueryCommandUtils {

    private static final Logger logger = LoggerFactory
            .getLogger("org.opencb.opencga.storage.app.cli.client.VariantQueryCommandUtils");

    /** Characters that may start the condition part of a stat filter, e.g. the '<' in "maf<0.1". */
    private static final String STAT_OPERATOR_CHARS = "<!~>=";

    /**
     * Builds the variant {@link Query} from the command line options.
     *
     * Delegates to {@code parseGenericVariantQuery} (not defined in this file; presumably inherited
     * from the parent class) and then adds the catalog sample filter.
     *
     * @param queryVariantsOptions parsed command line options
     * @param studyIds             available studies; only the values are passed on to the generic parser
     * @return the query built from the options
     * @throws Exception if the generic parser or the output format resolution fails
     */
    public static Query parseQuery(VariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions,
            Map<Long, String> studyIds) throws Exception {
        VariantOutputFormat outputFormat = VariantWriterFactory
                .toOutputFormat(queryVariantsOptions.commonOptions.outputFormat, queryVariantsOptions.output);
        Query query = parseGenericVariantQuery(queryVariantsOptions.genericVariantQueryOptions,
                queryVariantsOptions.study, studyIds.values(), queryVariantsOptions.numericOptions.count,
                outputFormat);

        addParam(query, VariantCatalogQueryUtils.SAMPLE_FILTER, queryVariantsOptions.sampleFilter);

        return query;
    }

    /**
     * Legacy implementation that maps every command line option to its query parameter by hand.
     *
     * @param queryVariantsOptions parsed command line options
     * @param studyIds             available studies; the keys are used as the default studies when
     *                             no study is given on the command line
     * @return the query built from the options
     * @throws Exception if the region file cannot be read, or if a single-study output format is
     *                   requested and the returned study cannot be narrowed down to exactly one
     * @throws ParameterException if the output format is not supported or the region file
     *                            contains a malformed line
     * @throws UnsupportedOperationException if a stat filter has no operator or an unknown name
     * @deprecated use {@link #parseQuery(VariantCommandOptions.VariantQueryCommandOptions, Map)}
     */
    @Deprecated
    public static Query oldParseQuery(VariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions,
            Map<Long, String> studyIds) throws Exception {
        Query query = new Query();

        /*
         * Parse Variant parameters
         */
        // An explicit region takes precedence over a region file.
        if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.region)) {
            query.put(REGION.key(), queryVariantsOptions.genericVariantQueryOptions.region);
        } else if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.regionFile)) {
            Path gffPath = Paths.get(queryVariantsOptions.genericVariantQueryOptions.regionFile);
            FileUtils.checkFile(gffPath);
            String regionsFromFile = Files.readAllLines(gffPath).stream()
                    // Blank lines and '#' comment/header lines carry no region.
                    .filter(line -> StringUtils.isNotBlank(line) && !line.startsWith("#"))
                    .map(VariantQueryCommandUtils::gffLineToRegion)
                    .collect(Collectors.joining(","));
            query.put(REGION.key(), regionsFromFile);
        }

        addParam(query, ID, queryVariantsOptions.genericVariantQueryOptions.id);
        addParam(query, GENE, queryVariantsOptions.genericVariantQueryOptions.gene);
        addParam(query, TYPE, queryVariantsOptions.genericVariantQueryOptions.type);

        // Holds study names (String) when given on the command line, study ids (Long) otherwise.
        List<Object> studies = new ArrayList<>();
        if (StringUtils.isNotEmpty(queryVariantsOptions.study)) {
            query.put(STUDIES.key(), queryVariantsOptions.study);
            for (String study : queryVariantsOptions.study.split(",|;")) {
                // Negated studies ("!study") are filters only; they are never returned.
                if (!study.startsWith("!")) {
                    studies.add(study);
                }
            }
        } else {
            studies = new ArrayList<>(studyIds.keySet());
        }

        // If the studies to be returned is empty then we return the studies being queried
        if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.returnStudy)) {
            List<String> returnedStudies = new ArrayList<>(
                    Arrays.asList(queryVariantsOptions.genericVariantQueryOptions.returnStudy.split(",")));
            query.put(RETURNED_STUDIES.key(), returnedStudies);
        } else if (!studies.isEmpty()) {
            query.put(RETURNED_STUDIES.key(), studies);
        }

        addParam(query, FILES, queryVariantsOptions.genericVariantQueryOptions.file);
        addParam(query, RETURNED_FILES, queryVariantsOptions.genericVariantQueryOptions.returnFile);
        addParam(query, FILTER, queryVariantsOptions.genericVariantQueryOptions.filter);
        addParam(query, GENOTYPE, queryVariantsOptions.genericVariantQueryOptions.sampleGenotype);
        addParam(query, SAMPLES, queryVariantsOptions.genericVariantQueryOptions.samples);
        addParam(query, VariantCatalogQueryUtils.SAMPLE_FILTER, queryVariantsOptions.sampleFilter);
        String returnSample = queryVariantsOptions.genericVariantQueryOptions.returnSample;
        if (returnSample != null) {
            // An empty value or "." requests that no samples are returned.
            if (returnSample.isEmpty() || returnSample.equals(".")) {
                query.put(RETURNED_SAMPLES.key(), Collections.emptyList());
            } else {
                query.put(RETURNED_SAMPLES.key(), returnSample);
            }
        }
        addParam(query, UNKNOWN_GENOTYPE, queryVariantsOptions.genericVariantQueryOptions.unknownGenotype);

        /*
         * Annotation parameters
         */
        addParam(query, ANNOT_CONSEQUENCE_TYPE, queryVariantsOptions.genericVariantQueryOptions.consequenceType);
        addParam(query, ANNOT_BIOTYPE, queryVariantsOptions.genericVariantQueryOptions.geneBiotype);
        addParam(query, ANNOT_POPULATION_ALTERNATE_FREQUENCY,
                queryVariantsOptions.genericVariantQueryOptions.populationFreqs);
        addParam(query, ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY,
                queryVariantsOptions.genericVariantQueryOptions.populationMaf);
        addParam(query, ANNOT_CONSERVATION, queryVariantsOptions.genericVariantQueryOptions.conservation);
        addParam(query, ANNOT_TRANSCRIPTION_FLAGS, queryVariantsOptions.genericVariantQueryOptions.flags);
        addParam(query, ANNOT_GENE_TRAITS_ID, queryVariantsOptions.genericVariantQueryOptions.geneTraitId);
        addParam(query, ANNOT_GENE_TRAITS_NAME, queryVariantsOptions.genericVariantQueryOptions.geneTraitName);
        addParam(query, ANNOT_HPO, queryVariantsOptions.genericVariantQueryOptions.hpo);
        addParam(query, ANNOT_GO, queryVariantsOptions.genericVariantQueryOptions.go);
        // NOTE: the expression filter (ANNOT_EXPRESSION) is currently not mapped.
        addParam(query, ANNOT_PROTEIN_KEYWORDS, queryVariantsOptions.genericVariantQueryOptions.proteinKeywords);
        addParam(query, ANNOT_DRUG, queryVariantsOptions.genericVariantQueryOptions.drugs);

        if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.proteinSubstitution)) {
            query.put(ANNOT_PROTEIN_SUBSTITUTION.key(),
                    queryVariantsOptions.genericVariantQueryOptions.proteinSubstitution);
        }

        /*
         * Stats parameters
         */
        if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.stats)) {
            addStatFilters(query, queryVariantsOptions.genericVariantQueryOptions.stats);
        }

        addParam(query, STATS_MAF, queryVariantsOptions.genericVariantQueryOptions.maf);
        addParam(query, STATS_MGF, queryVariantsOptions.genericVariantQueryOptions.mgf);
        addParam(query, MISSING_ALLELES, queryVariantsOptions.genericVariantQueryOptions.missingAlleleCount);
        addParam(query, MISSING_GENOTYPES, queryVariantsOptions.genericVariantQueryOptions.missingGenotypeCount);

        // Variants are only written out when the command is not a count, group-by or rank.
        boolean returnVariants = !queryVariantsOptions.numericOptions.count
                && StringUtils.isEmpty(queryVariantsOptions.genericVariantQueryOptions.groupBy)
                && StringUtils.isEmpty(queryVariantsOptions.genericVariantQueryOptions.rank);

        VariantOutputFormat outputFormat = VCF;
        if (StringUtils.isNotEmpty(queryVariantsOptions.commonOptions.outputFormat)) {
            outputFormat = VariantWriterFactory
                    .toOutputFormat(queryVariantsOptions.commonOptions.outputFormat, null);
            if (outputFormat == null) {
                throw variantFormatNotSupported(queryVariantsOptions.commonOptions.outputFormat);
            }
        }

        if (returnVariants && !outputFormat.isMultiStudyOutput()) {
            int returnedStudiesSize = query.getAsStringList(RETURNED_STUDIES.key()).size();
            if (returnedStudiesSize == 0 && studies.size() == 1) {
                query.put(RETURNED_STUDIES.key(), studies.get(0));
            } else if (returnedStudiesSize == 0 && studyIds.size() != 1 // No returned study and not exactly one available
                    || returnedStudiesSize > 1) { // Or more than one returned study is required
                throw new Exception("Only one study is allowed when returning " + outputFormat
                        + ", please use '--return-study' to select the returned study. Available studies: "
                        + studyIds);
            } else if (returnedStudiesSize == 0) {
                // Exactly one study is available here, so return it. The map is keyed by Long, so
                // get(0) (an Integer key) could never match; take the single key instead, as is
                // done above when no study is given on the command line.
                query.put(RETURNED_STUDIES.key(), studyIds.keySet().iterator().next());
            }
        }

        return query;
    }

    /**
     * Converts one tab-separated line of a region (GFF-like) file into a "chromosome:start-end" region.
     * Column 1 is used as the chromosome (with "chr" removed), columns 4 and 5 as start and end.
     *
     * @param line a non-blank, non-comment line of the region file
     * @return the region string
     * @throws ParameterException if the line has fewer than five tab-separated columns
     */
    private static String gffLineToRegion(String line) {
        String[] columns = line.split("\t");
        if (columns.length < 5) {
            throw new ParameterException(
                    "Malformed region file line, expected at least 5 tab-separated columns: '" + line + "'");
        }
        return columns[0].replace("chr", "") + ":" + columns[3] + "-" + columns[4];
    }

    /**
     * Parses a comma separated list of stat filters such as "maf<0.1" and adds each one to the query,
     * using the text before the first operator character as key and the remainder as condition.
     *
     * @param query query to add the filters to
     * @param stats comma separated stat filters
     * @throws UnsupportedOperationException if a filter has no operator or its name is not accepted
     */
    private static void addStatFilters(Query query, String stats) {
        Set<String> acceptedStatKeys = new HashSet<>(Arrays.asList(STATS_MAF.key(), STATS_MGF.key(),
                MISSING_ALLELES.key(), MISSING_GENOTYPES.key()));

        for (String stat : stats.split(",")) {
            // Position of the earliest operator character; everything before it is the stat name.
            int index = StringUtils.indexOfAny(stat, STAT_OPERATOR_CHARS);
            if (index < 0) {
                throw new UnsupportedOperationException("Unknown stat filter operation: " + stat);
            }
            String name = stat.substring(0, index);
            String cond = stat.substring(index);

            if (!acceptedStatKeys.contains(name)) {
                throw new UnsupportedOperationException("Unknown stat filter name: " + name);
            }
            query.put(name, cond);
            logger.info("Parsed stat filter: {} {}", name, cond);
        }
    }

    /**
     * Builds the {@link QueryOptions} from the command line options: starts from a copy of the
     * common free-form params and adds include, exclude, skip, limit and count when they are set.
     *
     * @param queryVariantsOptions parsed command line options
     * @return the query options
     */
    public static QueryOptions parseQueryOptions(
            VariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions) {
        QueryOptions queryOptions = new QueryOptions(new HashMap<>(queryVariantsOptions.commonOptions.params));

        if (StringUtils.isNotEmpty(queryVariantsOptions.dataModelOptions.include)) {
            queryOptions.add(QueryOptions.INCLUDE, queryVariantsOptions.dataModelOptions.include);
        }

        // "_id" is appended to any user supplied exclude list; nothing is excluded otherwise.
        if (StringUtils.isNotEmpty(queryVariantsOptions.dataModelOptions.exclude)) {
            queryOptions.add(QueryOptions.EXCLUDE, queryVariantsOptions.dataModelOptions.exclude + ",_id");
        }

        if (queryVariantsOptions.numericOptions.skip > 0) {
            queryOptions.add(QueryOptions.SKIP, queryVariantsOptions.numericOptions.skip);
        }

        if (queryVariantsOptions.numericOptions.limit > 0) {
            queryOptions.add(QueryOptions.LIMIT, queryVariantsOptions.numericOptions.limit);
        }

        if (queryVariantsOptions.numericOptions.count) {
            queryOptions.add("count", true);
        }

        return queryOptions;
    }

    /**
     * Logs that the given output format is not supported and creates the matching exception.
     * The exception is returned, not thrown, so callers can write {@code throw variantFormatNotSupported(...)}.
     *
     * @param outputFormat the rejected output format name
     * @return the exception to throw
     */
    public static ParameterException variantFormatNotSupported(String outputFormat) {
        logger.error("Format '{}' not supported", outputFormat);
        return new ParameterException("Format '" + outputFormat + "' not supported");
    }

}