Java tutorial
/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.app.cli.analysis.executors; import com.beust.jcommander.ParameterException; import org.apache.commons.lang3.StringUtils; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.utils.FileUtils; import org.opencb.opencga.app.cli.analysis.options.VariantCommandOptions; import org.opencb.opencga.storage.core.manager.variant.VariantCatalogQueryUtils; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.*; import static org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat.VCF; /** * Created by imedina on 30/12/15. 
*
* <p>Static helpers that translate the options of the variant query command into the
* {@link Query} and {@link QueryOptions} objects consumed by the variant storage layer.
*/
public class VariantQueryCommandUtils
        extends org.opencb.opencga.storage.app.cli.client.executors.VariantQueryCommandUtils {

    private static final Logger logger = LoggerFactory
            .getLogger("org.opencb.opencga.storage.app.cli.client.VariantQueryCommandUtils");

    /**
     * Characters that mark the start of the condition in a stat filter such as {@code maf<0.05}.
     * They are tried in this order and the first one present in the filter wins.
     */
    private static final String[] STAT_OPERATOR_MARKERS = {"<", "!", "~", ">", "="};

    /**
     * Builds the variant {@link Query} for the given command options.
     *
     * <p>The output format is resolved from the {@code outputFormat} and {@code output} options, the
     * generic part of the query is delegated to {@code parseGenericVariantQuery} (not defined in this
     * class; presumably inherited from the parent) and the sample filter is added on top.
     *
     * @param queryVariantsOptions parsed command line options
     * @param studyIds             available studies; only the map values are forwarded
     * @return the query
     * @throws Exception if the delegated parsing fails
     */
    public static Query parseQuery(VariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions,
                                   Map<Long, String> studyIds) throws Exception {
        VariantOutputFormat of = VariantWriterFactory.toOutputFormat(
                queryVariantsOptions.commonOptions.outputFormat, queryVariantsOptions.output);
        Query query = parseGenericVariantQuery(queryVariantsOptions.genericVariantQueryOptions,
                queryVariantsOptions.study, studyIds.values(), queryVariantsOptions.numericOptions.count, of);

        addParam(query, VariantCatalogQueryUtils.SAMPLE_FILTER, queryVariantsOptions.sampleFilter);

        return query;
    }

    /**
     * Legacy, hand-written translation of the command options into a {@link Query}.
     *
     * <p>Handles, in order: region (inline or read from a GFF-like region file), variant filters,
     * queried and returned studies, file/sample filters, annotation filters, stats filters and, when
     * variants are going to be returned in a single-study output format, the selection of the one
     * returned study.
     *
     * @param queryVariantsOptions parsed command line options
     * @param studyIds             available studies
     * @return the query
     * @throws Exception if the region file cannot be read, the output format is not supported, or more
     *                   than one study would be returned in a single-study output format
     * @deprecated use {@link #parseQuery(VariantCommandOptions.VariantQueryCommandOptions, Map)}
     */
    @Deprecated
    public static Query oldParseQuery(VariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions,
                                      Map<Long, String> studyIds) throws Exception {
        Query query = new Query();

        /*
         * Parse Variant parameters
         */
        if (queryVariantsOptions.genericVariantQueryOptions.region != null
                && !queryVariantsOptions.genericVariantQueryOptions.region.isEmpty()) {
            query.put(REGION.key(), queryVariantsOptions.genericVariantQueryOptions.region);
        } else if (queryVariantsOptions.genericVariantQueryOptions.regionFile != null
                && !queryVariantsOptions.genericVariantQueryOptions.regionFile.isEmpty()) {
            Path gffPath = Paths.get(queryVariantsOptions.genericVariantQueryOptions.regionFile);
            FileUtils.checkFile(gffPath);
            query.put(REGION.key(), gffLinesToRegions(Files.readAllLines(gffPath)));
        }

        addParam(query, ID, queryVariantsOptions.genericVariantQueryOptions.id);
        addParam(query, GENE, queryVariantsOptions.genericVariantQueryOptions.gene);
        addParam(query, TYPE, queryVariantsOptions.genericVariantQueryOptions.type);

        // Holds study names (String) when taken from the command line, or study ids (Long) when
        // falling back to every available study.
        List<Object> studies = new ArrayList<>();
        if (StringUtils.isNotEmpty(queryVariantsOptions.study)) {
            query.put(STUDIES.key(), queryVariantsOptions.study);
            for (String study : queryVariantsOptions.study.split(",|;")) {
                // Negated studies are part of the filter but must never be returned.
                if (!study.startsWith("!")) {
                    studies.add(study);
                }
            }
        } else {
            studies.addAll(studyIds.keySet());
        }

        // If the studies to be returned is empty then we return the studies being queried
        if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.returnStudy)) {
            List<String> returnedStudies = new ArrayList<>();
            Collections.addAll(returnedStudies,
                    queryVariantsOptions.genericVariantQueryOptions.returnStudy.split(","));
            query.put(RETURNED_STUDIES.key(), returnedStudies);
        } else if (!studies.isEmpty()) {
            query.put(RETURNED_STUDIES.key(), studies);
        }

        addParam(query, FILES, queryVariantsOptions.genericVariantQueryOptions.file);
        addParam(query, RETURNED_FILES, queryVariantsOptions.genericVariantQueryOptions.returnFile);
        addParam(query, FILTER, queryVariantsOptions.genericVariantQueryOptions.filter);
        addParam(query, GENOTYPE, queryVariantsOptions.genericVariantQueryOptions.sampleGenotype);
        addParam(query, SAMPLES, queryVariantsOptions.genericVariantQueryOptions.samples);
        addParam(query, VariantCatalogQueryUtils.SAMPLE_FILTER, queryVariantsOptions.sampleFilter);
        if (queryVariantsOptions.genericVariantQueryOptions.returnSample != null) {
            // An empty value or "." explicitly asks for no samples at all.
            if (queryVariantsOptions.genericVariantQueryOptions.returnSample.isEmpty()
                    || queryVariantsOptions.genericVariantQueryOptions.returnSample.equals(".")) {
                query.put(RETURNED_SAMPLES.key(), Collections.emptyList());
            } else {
                query.put(RETURNED_SAMPLES.key(), queryVariantsOptions.genericVariantQueryOptions.returnSample);
            }
        }
        addParam(query, UNKNOWN_GENOTYPE, queryVariantsOptions.genericVariantQueryOptions.unknownGenotype);

        /*
         * Annotation parameters
         */
        addParam(query, ANNOT_CONSEQUENCE_TYPE, queryVariantsOptions.genericVariantQueryOptions.consequenceType);
        addParam(query, ANNOT_BIOTYPE, queryVariantsOptions.genericVariantQueryOptions.geneBiotype);
        addParam(query, ANNOT_POPULATION_ALTERNATE_FREQUENCY,
                queryVariantsOptions.genericVariantQueryOptions.populationFreqs);
        addParam(query, ANNOT_POPULATION_MINOR_ALLELE_FREQUENCY,
                queryVariantsOptions.genericVariantQueryOptions.populationMaf);
        addParam(query, ANNOT_CONSERVATION, queryVariantsOptions.genericVariantQueryOptions.conservation);
        addParam(query, ANNOT_TRANSCRIPTION_FLAGS, queryVariantsOptions.genericVariantQueryOptions.flags);
        addParam(query, ANNOT_GENE_TRAITS_ID, queryVariantsOptions.genericVariantQueryOptions.geneTraitId);
        addParam(query, ANNOT_GENE_TRAITS_NAME, queryVariantsOptions.genericVariantQueryOptions.geneTraitName);
        addParam(query, ANNOT_HPO, queryVariantsOptions.genericVariantQueryOptions.hpo);
        addParam(query, ANNOT_GO, queryVariantsOptions.genericVariantQueryOptions.go);
        addParam(query, ANNOT_PROTEIN_KEYWORDS, queryVariantsOptions.genericVariantQueryOptions.proteinKeywords);
        addParam(query, ANNOT_DRUG, queryVariantsOptions.genericVariantQueryOptions.drugs);

        if (StringUtils.isNotEmpty(queryVariantsOptions.genericVariantQueryOptions.proteinSubstitution)) {
            query.put(ANNOT_PROTEIN_SUBSTITUTION.key(),
                    queryVariantsOptions.genericVariantQueryOptions.proteinSubstitution);
        }

        /*
         * Stats parameters
         */
        if (queryVariantsOptions.genericVariantQueryOptions.stats != null
                && !queryVariantsOptions.genericVariantQueryOptions.stats.isEmpty()) {
            addStatFilters(query, queryVariantsOptions.genericVariantQueryOptions.stats);
        }

        addParam(query, STATS_MAF, queryVariantsOptions.genericVariantQueryOptions.maf);
        addParam(query, STATS_MGF, queryVariantsOptions.genericVariantQueryOptions.mgf);
        addParam(query, MISSING_ALLELES, queryVariantsOptions.genericVariantQueryOptions.missingAlleleCount);
        addParam(query, MISSING_GENOTYPES, queryVariantsOptions.genericVariantQueryOptions.missingGenotypeCount);

        // Variants are only written out when the command is not a count, group-by or rank.
        boolean returnVariants = !queryVariantsOptions.numericOptions.count
                && StringUtils.isEmpty(queryVariantsOptions.genericVariantQueryOptions.groupBy)
                && StringUtils.isEmpty(queryVariantsOptions.genericVariantQueryOptions.rank);

        VariantOutputFormat of = VCF;
        if (StringUtils.isNotEmpty(queryVariantsOptions.commonOptions.outputFormat)) {
            of = VariantWriterFactory.toOutputFormat(queryVariantsOptions.commonOptions.outputFormat, null);
            if (of == null) {
                throw variantFormatNotSupported(queryVariantsOptions.commonOptions.outputFormat);
            }
        }

        if (returnVariants && !of.isMultiStudyOutput()) {
            int returnedStudiesSize = query.getAsStringList(RETURNED_STUDIES.key()).size();
            if (returnedStudiesSize == 0 && studies.size() == 1) {
                query.put(RETURNED_STUDIES.key(), studies.get(0));
            } else if (returnedStudiesSize == 0 && studyIds.size() != 1 // No returned studies and not exactly one available
                    || returnedStudiesSize > 1) {                       // Or more than one returned study is required
                throw new Exception("Only one study is allowed when returning " + of
                        + ", please use '--return-study' to select the returned "
                        + "study. Available studies: " + studyIds);
            } else if (returnedStudiesSize == 0) {
                // Exactly one study is available here. Take its id from the key set: a lookup such as
                // studyIds.get(0) boxes 0 to an Integer, which never equals a Long key, and would
                // therefore always yield null.
                query.put(RETURNED_STUDIES.key(), studyIds.keySet().iterator().next());
            }
        }

        return query;
    }

    /**
     * Converts the lines of a tab-separated, GFF-like region file into a comma separated list of
     * {@code chromosome:start-end} regions, using columns 1, 4 and 5 of every record and removing
     * any "chr" from the chromosome name.
     *
     * <p>Blank lines and lines starting with '#' are skipped.
     *
     * @param gffLines lines of the region file
     * @return comma separated regions, empty if the file holds no records
     * @throws ParameterException if a record has fewer than five columns
     */
    private static String gffLinesToRegions(List<String> gffLines) {
        StringJoiner regions = new StringJoiner(",");
        int lineNumber = 0;
        for (String line : gffLines) {
            lineNumber++;
            if (line.trim().isEmpty() || line.startsWith("#")) {
                continue;
            }
            String[] columns = line.split("\t");
            if (columns.length < 5) {
                throw new ParameterException("Malformed region file, line " + lineNumber
                        + ": expected at least 5 tab-separated columns but found " + columns.length);
            }
            regions.add(columns[0].replace("chr", "") + ":" + columns[3] + "-" + columns[4]);
        }
        return regions.toString();
    }

    /**
     * Parses a comma separated list of stat filters ({@code name} immediately followed by a condition
     * starting with one of {@link #STAT_OPERATOR_MARKERS}) and stores each condition in the query under
     * its stat name.
     *
     * @param query query to be filled in
     * @param stats comma separated stat filters
     * @throws UnsupportedOperationException if a filter has no recognised operator or an unknown name
     */
    private static void addStatFilters(Query query, String stats) {
        Set<String> acceptedStatKeys = new HashSet<>(Arrays.asList(STATS_MAF.key(),
                STATS_MGF.key(),
                MISSING_ALLELES.key(),
                MISSING_GENOTYPES.key()));

        for (String stat : stats.split(",")) {
            int index = indexOfStatOperator(stat);
            if (index < 0) {
                throw new UnsupportedOperationException("Unknown stat filter operation: " + stat);
            }
            String name = stat.substring(0, index);
            String cond = stat.substring(index);

            if (!acceptedStatKeys.contains(name)) {
                throw new UnsupportedOperationException("Unknown stat filter name: " + name);
            }
            query.put(name, cond);
            logger.info("Parsed stat filter: {} {}", name, cond);
        }
    }

    /**
     * Returns the position of the first {@link #STAT_OPERATOR_MARKERS} entry, in declaration order,
     * that occurs in the given stat filter, or -1 if none does.
     */
    private static int indexOfStatOperator(String stat) {
        for (String marker : STAT_OPERATOR_MARKERS) {
            int index = stat.indexOf(marker);
            if (index >= 0) {
                return index;
            }
        }
        return -1;
    }

    /**
     * Builds the {@link QueryOptions} for the given command options.
     *
     * <p>Starts from a copy of the free-form {@code params} and adds include, exclude (always
     * extended with {@code _id}), skip and limit (only when positive) and the count flag.
     *
     * @param queryVariantsOptions parsed command line options
     * @return the query options
     */
    public static QueryOptions parseQueryOptions(
            VariantCommandOptions.VariantQueryCommandOptions queryVariantsOptions) {
        QueryOptions queryOptions = new QueryOptions(new HashMap<>(queryVariantsOptions.commonOptions.params));

        if (StringUtils.isNotEmpty(queryVariantsOptions.dataModelOptions.include)) {
            queryOptions.add(QueryOptions.INCLUDE, queryVariantsOptions.dataModelOptions.include);
        }

        if (StringUtils.isNotEmpty(queryVariantsOptions.dataModelOptions.exclude)) {
            queryOptions.add(QueryOptions.EXCLUDE, queryVariantsOptions.dataModelOptions.exclude + ",_id");
        }

        if (queryVariantsOptions.numericOptions.skip > 0) {
            queryOptions.add(QueryOptions.SKIP, queryVariantsOptions.numericOptions.skip);
        }

        if (queryVariantsOptions.numericOptions.limit > 0) {
            queryOptions.add(QueryOptions.LIMIT, queryVariantsOptions.numericOptions.limit);
        }

        if (queryVariantsOptions.numericOptions.count) {
            queryOptions.add("count", true);
        }

        return queryOptions;
    }

    /**
     * Logs that the given output format is not supported and creates the matching exception.
     * The exception is returned, not thrown, so callers can write {@code throw variantFormatNotSupported(...)}.
     *
     * @param outputFormat the rejected output format
     * @return the exception to be thrown by the caller
     */
    public static ParameterException variantFormatNotSupported(String outputFormat) {
        logger.error("Format '{}' not supported", outputFormat);
        return new ParameterException("Format '" + outputFormat + "' not supported");
    }
}