Java tutorial
/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// NOTE(review): the package name was lost when this listing was extracted (it read "package;").
// It is restored here from the upstream CellBase project layout - confirm before committing.
package org.opencb.cellbase.app.cli;

import com.beust.jcommander.ParameterException;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.MapperFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.bson.Document;
// NOTE(review): four import statements were lost here during extraction (each read "import;").
// Across the three gaps flagged in this import block, 13 project imports are missing. Classes used
// below without a visible import include: JsonAnnotationWriter, JsonVariantReader, VepFormatReader,
// VepFormatWriter, FastaIndexManager, VariantContextToVariantConverter, BenchmarkDataWriter,
// BenchmarkTask, VariantAnnotationDiff, VariationDataReader, the *AnnotatorTask classes, DataReader,
// DataWriter, StringDataReader and ParallelTaskRunner. Restore the imports from the upstream file.
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantNormalizer;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField;
// NOTE(review): five import statements were lost here during extraction - see the list above.
import org.opencb.cellbase.core.api.DBAdaptorFactory;
import org.opencb.cellbase.core.api.GenomeDBAdaptor;
import org.opencb.cellbase.core.client.CellBaseClient;
import org.opencb.cellbase.core.variant.annotation.*;
import org.opencb.cellbase.lib.impl.MongoDBAdaptorFactory;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
// NOTE(review): four import statements were lost here during extraction - see the list above.
import org.opencb.commons.utils.FileUtils;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;

import java.io.*;
import java.net.URISyntaxException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.zip.GZIPInputStream;

import static java.nio.file.StandardOpenOption.APPEND;

/**
 * Command line executor for the variant annotation command.
 *
 * <p>Depending on the parsed options it either runs a benchmark against the {@code .vep} files found
 * in the input directory, annotates a list of variants given directly on the command line, annotates
 * a VCF/JSON input file, or annotates the CellBase variation collection chromosome by chromosome.
 * Custom VCF files and a population frequencies file can be indexed in RocksDB and used as
 * additional annotation sources.
 *
 * Created by fjlopez on 18/03/15.
 */
public class VariantAnnotationCommandExecutor extends CommandExecutor {

    /** File formats handled for input and/or output. */
    public enum FileFormat {
        VCF,
        JSON,
        VEP
    }

    private static final int QUEUE_CAPACITY = 10;
    private static final String TMP_DIR = "/tmp/";
    private static final String VARIATION_ANNOTATION_FILE_PREFIX = "variation_annotation_";

    private CliOptionsParser.VariantAnnotationCommandOptions variantAnnotationCommandOptions;

    private Path input;
    private Path output;
    private String url;
    private boolean local;
    private boolean cellBaseAnnotation;
    private boolean benchmark;
    private Path referenceFasta;
    private boolean normalize;
    private List<String> chromosomeList;
    private int port;
    private String species;
    private String assembly;
    private int numThreads;
    private int batchSize;
    private List<Path> customFiles;
    private Path populationFrequenciesFile = null;
    // dbIndexes, dbOptions and dbLocations are parallel lists filled by getIndexes(); when a
    // population frequencies file is used its entry is always the last one.
    private List<RocksDB> dbIndexes;
    private List<Options> dbOptions;
    private List<String> dbLocations;
    private List<String> customFileIds;
    private List<List<String>> customFileFields;

    private FileFormat inputFormat;
    private FileFormat outputFormat;

    private QueryOptions queryOptions;
    private DBAdaptorFactory dbAdaptorFactory = null;

    /**
     * Creates the executor from the parsed command line options.
     *
     * @param variantAnnotationCommandOptions parsed options of the variant annotation command
     */
    public VariantAnnotationCommandExecutor(
            CliOptionsParser.VariantAnnotationCommandOptions variantAnnotationCommandOptions) {
        super(variantAnnotationCommandOptions.commonOptions.logLevel,
                variantAnnotationCommandOptions.commonOptions.verbose,
                variantAnnotationCommandOptions.commonOptions.conf);

        this.variantAnnotationCommandOptions = variantAnnotationCommandOptions;
        this.queryOptions = new QueryOptions();

        if (variantAnnotationCommandOptions.input != null) {
            input = Paths.get(variantAnnotationCommandOptions.input);
        }
        if (variantAnnotationCommandOptions.output != null) {
            output = Paths.get(variantAnnotationCommandOptions.output);
        }
    }

    /**
     * Validates the command line parameters and runs either the benchmark or the annotation.
     * Any exception is logged; nothing is propagated to the caller.
     */
    @Override
    public void execute() {
        try {
            checkParameters();
            if (benchmark) {
                runBenchmark();
            } else {
                runAnnotation();
            }
            logger.info("Finished");
        } catch (Exception e) {
            logger.error("Variant annotation command failed", e);
        }
    }

    /**
     * Runs the benchmark over every {@code .vep} file found in the input directory, writing the
     * differences through the benchmark data writer. Exceptions are logged, not propagated.
     */
    private void runBenchmark() {
        try {
            FastaIndexManager fastaIndexManager = getFastaIndexManger();
            // The directory stream holds an open directory handle: make sure it is always released
            try (DirectoryStream<Path> stream = Files.newDirectoryStream(input,
                    entry -> entry.getFileName().toString().endsWith(".vep"))) {
                DataWriter dataWriter = getDataWriter(output.toString());
                ParallelTaskRunner.Config config =
                        new ParallelTaskRunner.Config(numThreads, batchSize, QUEUE_CAPACITY, false);
                List<ParallelTaskRunner.TaskWithException<VariantAnnotation,
                        Pair<VariantAnnotationDiff, VariantAnnotationDiff>, Exception>> variantAnnotatorTaskList =
                        getBenchmarkTaskList(fastaIndexManager);
                for (Path entry : stream) {
                    logger.info("Processing file '{}'", entry.toString());
                    DataReader dataReader = new VepFormatReader(input.resolve(entry.getFileName()).toString());
                    ParallelTaskRunner<VariantAnnotation, Pair<VariantAnnotationDiff, VariantAnnotationDiff>> runner =
                            new ParallelTaskRunner<>(dataReader, variantAnnotatorTaskList, dataWriter, config);
                    runner.run();
                }
            }
        } catch (Exception e) {
            logger.error("Benchmark run failed", e);
        }
    }

    /**
     * Opens the index of the reference fasta file, building it when it is not available yet.
     *
     * @return the index manager, or {@code null} if it could not be created (the error is logged)
     */
    private FastaIndexManager getFastaIndexManger() {
        // Preparing the fasta file for fast accessing
        FastaIndexManager fastaIndexManager = null;
        try {
            fastaIndexManager = new FastaIndexManager(referenceFasta, true);
            if (!fastaIndexManager.isConnected()) {
                fastaIndexManager.index();
            }
        } catch (Exception e) {
            logger.error("Unable to prepare the fasta index for " + referenceFasta, e);
        }
        return fastaIndexManager;
    }

    /**
     * Creates one benchmark task per thread.
     *
     * @param fastaIndexManager index of the reference genome handed to every task
     * @return list with {@code numThreads} benchmark tasks
     * @throws IOException declared for callers; kept from the original signature
     */
    private List<ParallelTaskRunner.TaskWithException<VariantAnnotation,
            Pair<VariantAnnotationDiff, VariantAnnotationDiff>, Exception>> getBenchmarkTaskList(
            FastaIndexManager fastaIndexManager) throws IOException {
        List<ParallelTaskRunner.TaskWithException<VariantAnnotation,
                Pair<VariantAnnotationDiff, VariantAnnotationDiff>, Exception>> benchmarkTaskList =
                new ArrayList<>(numThreads);
        for (int i = 0; i < numThreads; i++) {
            // Benchmark variants are read from a VEP file, must not normalize
            benchmarkTaskList.add(new BenchmarkTask(createCellBaseAnnotator(), fastaIndexManager));
        }
        return benchmarkTaskList;
    }

    /**
     * Runs the annotation selected by the command line options.
     *
     * <p>The RocksDB indexes built by {@link #getIndexes()} are closed on every exit path,
     * including the early return for command line variants and exceptional termination.
     *
     * @return {@code true} if the variants given with the {@code variant} option were handled,
     *         {@code false} if an input file or the variation collection was processed
     * @throws Exception if reading, annotating or writing fails
     */
    private boolean runAnnotation() throws Exception {
        // Build indexes for custom files and/or population frequencies file
        getIndexes();
        try {
            if (variantAnnotationCommandOptions.variant != null
                    && !variantAnnotationCommandOptions.variant.isEmpty()) {
                annotateVariantList();
                return true;
            }

            if (input != null) {
                annotateInputFile();
            } else if (cellBaseAnnotation) {
                annotateVariationCollection();
            }
        } finally {
            if (customFiles != null || populationFrequenciesFile != null) {
                closeIndexes();
            }
        }

        logger.info("Variant annotation finished.");
        return false;
    }

    /**
     * Annotates the variants given directly on the command line and writes one JSON line per
     * variant to the output file. Annotation is only carried out with a local installation.
     */
    private void annotateVariantList() throws Exception {
        List<Variant> variants = Variant.parseVariants(variantAnnotationCommandOptions.variant);
        if (!local) {
            logger.warn("Annotation of a variant list requires a local installation; nothing has been annotated");
            return;
        }

        DBAdaptorFactory localDbAdaptorFactory = new MongoDBAdaptorFactory(configuration);
        VariantAnnotationCalculator variantAnnotationCalculator =
                new VariantAnnotationCalculator(this.species, this.assembly, localDbAdaptorFactory);
        List<QueryResult<VariantAnnotation>> annotationByVariantList =
                variantAnnotationCalculator.getAnnotationByVariantList(variants, queryOptions);

        ObjectMapper jsonObjectMapper = new ObjectMapper();
        jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
        jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
        ObjectWriter objectWriter = jsonObjectMapper.writer();

        Path outPath = Paths.get(variantAnnotationCommandOptions.output);
        FileUtils.checkDirectory(outPath.getParent());
        // try-with-resources: the writer is flushed and closed even if serialization fails
        try (BufferedWriter bufferedWriter = FileUtils.newBufferedWriter(outPath)) {
            for (QueryResult<VariantAnnotation> queryResult : annotationByVariantList) {
                bufferedWriter.write(objectWriter.writeValueAsString(queryResult.getResult()));
                bufferedWriter.newLine();
            }
        }
    }

    /**
     * Annotates the variants of the input file. Lines in the input file can be computationally
     * expensive to parse, i.e.: multisample vcf with thousands of samples, so the lines are read as
     * plain strings and parsed in parallel within the annotation tasks.
     */
    private void annotateInputFile() throws Exception {
        DataReader dataReader = new StringDataReader(input);
        List<ParallelTaskRunner.TaskWithException<String, Variant, Exception>> variantAnnotatorTaskList =
                getStringTaskList();
        DataWriter dataWriter = getDataWriter(output.toString());
        ParallelTaskRunner.Config config =
                new ParallelTaskRunner.Config(numThreads, batchSize, QUEUE_CAPACITY, false);
        ParallelTaskRunner<String, Variant> runner =
                new ParallelTaskRunner<>(dataReader, variantAnnotatorTaskList, dataWriter, config);
        runner.run();

        // For internal use only - will only be run when -Dpopulation-frequencies is activated
        writeRemainingPopFrequencies();
    }

    /**
     * Annotates the CellBase variation collection, writing one
     * {@code variation_annotation_<chromosome>.json.gz} file per chromosome into the output path.
     */
    private void annotateVariationCollection() throws Exception {
        // TODO: enable this query in the parseQuery method within VariantMongoDBAdaptor
        QueryOptions options = new QueryOptions("include", "chromosome,start,reference,alternate,type");
        List<ParallelTaskRunner.TaskWithException<Variant, Variant, Exception>> variantAnnotatorTaskList =
                getVariantTaskList();
        ParallelTaskRunner.Config config =
                new ParallelTaskRunner.Config(numThreads, batchSize, QUEUE_CAPACITY, false);

        for (String chromosome : chromosomeList) {
            logger.info("Annotating chromosome {}", chromosome);
            Query query = new Query("chromosome", chromosome);
            DataReader dataReader =
                    new VariationDataReader(dbAdaptorFactory.getVariationDBAdaptor(species), query, options);
            DataWriter dataWriter = getDataWriter(
                    output.toString() + "/" + VARIATION_ANNOTATION_FILE_PREFIX + chromosome + ".json.gz");
            ParallelTaskRunner<Variant, Variant> runner =
                    new ParallelTaskRunner<Variant, Variant>(dataReader, variantAnnotatorTaskList, dataWriter, config);
            runner.run();
        }
    }

    /**
     * Appends to the output file the variants of the population frequencies index that were not
     * found within the input file. Does nothing unless a population frequencies file was provided.
     *
     * @throws IOException if a record cannot be deserialized
     */
    private void writeRemainingPopFrequencies() throws IOException {
        // For internal use only - will only be run when -Dpopulation-frequencies is activated
        if (populationFrequenciesFile == null) {
            return;
        }

        DataWriter dataWriter = new JsonAnnotationWriter(output.toString(), APPEND);
        dataWriter.open();
        dataWriter.pre();

        // Population frequencies rocks db will always be the last one in the list. DO NOT change the name of the
        // rocksIterator variable - for some unexplainable reason Java VM crashes if it's named "iterator"
        RocksIterator rocksIterator = dbIndexes.get(dbIndexes.size() - 1).newIterator();
        ObjectMapper mapper = new ObjectMapper();

        logger.info("Writing variants with frequencies from {} that were not found within the input file to {}",
                populationFrequenciesFile.toString(), output.toString());
        int counter = 0;
        for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) {
            VariantAvro variantAvro = mapper.readValue(rocksIterator.value(), VariantAvro.class);
            // Records whose additional attributes are still null are the ones written out here.
            // NOTE(review): presumably the annotator flags visited variants by initializing this field
            // with an empty map - confirm against PopulationFrequenciesAnnotator.
            if (variantAvro.getAnnotation().getAdditionalAttributes() == null) {
                dataWriter.write(new Variant(variantAvro));
            }
            counter++;
            if (counter % 10000 == 0) {
                logger.info("{} written", counter);
            }
        }
        // NOTE(review): a statement was lost here during extraction (only ";" was left); post() is
        // restored to match the open/pre ... close sequence used above - confirm.
        dataWriter.post();
        dataWriter.close();
        logger.info("Done.");
    }

    /**
     * Sets the list of chromosomes to annotate: the ones given by the user or, if none was given,
     * every chromosome name returned by the genome info query.
     */
    private void setChromosomeList() {
        if (variantAnnotationCommandOptions.chromosomeList != null
                && !variantAnnotationCommandOptions.chromosomeList.isEmpty()) {
            chromosomeList = Arrays.asList(variantAnnotationCommandOptions.chromosomeList.split(","));
            logger.info("Setting chromosomes {} for variant annotation", chromosomeList.toString());
        } else {
            // If the user does not provide any chromosome, fill chromosomeList with all available
            // chromosomes in the database
            logger.info("Getting full list of chromosome names in the database");
            dbAdaptorFactory = new MongoDBAdaptorFactory(configuration);
            GenomeDBAdaptor genomeDBAdaptor = dbAdaptorFactory.getGenomeDBAdaptor(species, assembly);
            // NOTE(review): the include value was stripped during extraction (it read ""); it is
            // restored from the fields read below ("chromosomes" -> "name") - confirm.
            QueryResult queryResult = genomeDBAdaptor.getGenomeInfo(new QueryOptions("include", "chromosomes.name"));

            List<Document> chromosomeDocumentList =
                    (List<Document>) ((List<Document>) queryResult.getResult()).get(0).get("chromosomes");
            chromosomeList = new ArrayList<>(chromosomeDocumentList.size());
            for (Document chromosomeDocument : chromosomeDocumentList) {
                chromosomeList.add((String) chromosomeDocument.get("name"));
            }
            logger.info("Available chromosomes: {}", chromosomeList.toString());
        }
    }

    /**
     * Creates the writer matching the run mode and the output format.
     *
     * @param filename file the annotation writers write to (not used by the benchmark writer)
     * @return benchmark writer in benchmark mode; otherwise a JSON or VEP writer, or {@code null}
     *         for any other output format
     */
    private DataWriter getDataWriter(String filename) {
        if (benchmark) {
            return new BenchmarkDataWriter("VEP", "CellBase", output);
        }
        switch (outputFormat) {
            case JSON:
                return new JsonAnnotationWriter(filename);
            case VEP:
                return new VepFormatWriter(filename);
            default:
                return null;
        }
    }

    /**
     * Creates one task per thread able to parse and annotate the raw lines of the input file.
     *
     * @return list of annotation tasks; empty if the input format is neither VCF nor JSON
     * @throws IOException if the VCF header cannot be read
     */
    private List<ParallelTaskRunner.TaskWithException<String, Variant, Exception>> getStringTaskList()
            throws IOException {
        List<ParallelTaskRunner.TaskWithException<String, Variant, Exception>> variantAnnotatorTaskList =
                new ArrayList<>(numThreads);
        for (int i = 0; i < numThreads; i++) {
            List<VariantAnnotator> variantAnnotatorList = createAnnotators();
            switch (inputFormat) {
                case VCF:
                    logger.info("Using HTSJDK to read variants.");
                    FullVcfCodec codec = new FullVcfCodec();
                    try (InputStream fileInputStream = input.toString().endsWith("gz")
                            ? new GZIPInputStream(new FileInputStream(input.toFile()))
                            : new FileInputStream(input.toFile())) {
                        LineIterator lineIterator = codec.makeSourceFromStream(fileInputStream);
                        VCFHeader header = (VCFHeader) codec.readActualHeader(lineIterator);
                        VCFHeaderVersion headerVersion = codec.getVCFHeaderVersion();
                        variantAnnotatorTaskList.add(
                                new VcfStringAnnotatorTask(header, headerVersion, variantAnnotatorList, normalize));
                    } catch (IOException e) {
                        // Keep the underlying cause so the actual read problem can be diagnosed
                        throw new IOException("Unable to read VCFHeader", e);
                    }
                    break;
                case JSON:
                    logger.info("Using a JSON parser to read variants...");
                    variantAnnotatorTaskList.add(new JsonStringAnnotatorTask(variantAnnotatorList, normalize));
                    break;
                default:
                    break;
            }
        }
        return variantAnnotatorTaskList;
    }

    /**
     * Creates one task per thread to annotate already parsed {@link Variant} objects.
     *
     * @return list with {@code numThreads} annotation tasks
     * @throws IOException declared for callers; kept from the original signature
     */
    private List<ParallelTaskRunner.TaskWithException<Variant, Variant, Exception>> getVariantTaskList()
            throws IOException {
        List<ParallelTaskRunner.TaskWithException<Variant, Variant, Exception>> variantAnnotatorTaskList =
                new ArrayList<>(numThreads);
        for (int i = 0; i < numThreads; i++) {
            variantAnnotatorTaskList.add(new VariantAnnotatorTask(createAnnotators()));
        }
        return variantAnnotatorTaskList;
    }

    /**
     * Closes every RocksDB index opened by {@link #getIndexes()} and removes the population
     * frequencies index from disk.
     *
     * @throws IOException if the population frequencies index cannot be deleted
     */
    private void closeIndexes() throws IOException {
        for (int i = 0; i < dbIndexes.size(); i++) {
            dbIndexes.get(i).close();
            dbOptions.get(i).dispose();
        }

        if (populationFrequenciesFile != null) {
            // NOTE(review): the call wrapping "new File(<last db location>)" was lost during
            // extraction; the surviving fragment suggests a recursive delete of the population
            // frequencies index (always the last location). Implemented with java.nio - confirm.
            deleteRecursively(Paths.get(dbLocations.get(dbLocations.size() - 1)));
        }
    }

    /** Deletes a file, or a directory together with its content; does nothing if the path is absent. */
    private static void deleteRecursively(Path path) throws IOException {
        if (Files.isDirectory(path) && !Files.isSymbolicLink(path)) {
            try (DirectoryStream<Path> children = Files.newDirectoryStream(path)) {
                for (Path child : children) {
                    deleteRecursively(child);
                }
            }
        }
        Files.deleteIfExists(path);
    }

    /** Tells whether the given file name ends with {@code .vcf} or {@code .vcf.gz}. */
    private static boolean hasVcfExtension(String fileName) {
        return fileName.endsWith(".vcf") || fileName.endsWith(".vcf.gz");
    }

    /**
     * Creates the annotators used by one task: the CellBase annotator plus one annotator per custom
     * VCF file and one for the population frequencies file, when provided.
     *
     * @return list of annotators, CellBase annotator first
     */
    private List<VariantAnnotator> createAnnotators() {
        List<VariantAnnotator> variantAnnotatorList = new ArrayList<>();

        // CellBase annotator is always called
        variantAnnotatorList.add(createCellBaseAnnotator());

        // Include custom annotators if required
        if (customFiles != null) {
            for (int i = 0; i < customFiles.size(); i++) {
                String customFileName = customFiles.get(i).toString();
                if (hasVcfExtension(customFileName)) {
                    variantAnnotatorList.add(new VcfVariantAnnotator(customFileName, dbIndexes.get(i),
                            customFileIds.get(i), customFileFields.get(i)));
                }
            }
        }

        // Include population-frequencies file if required
        if (populationFrequenciesFile != null) {
            // Rocks db connection is always the last in the list
            RocksDB populationFrequenciesIndex = dbIndexes.get(dbIndexes.size() - 1);
            variantAnnotatorList.add(new PopulationFrequenciesAnnotator(populationFrequenciesFile.toString(),
                    populationFrequenciesIndex));
        }

        return variantAnnotatorList;
    }

    /**
     * Creates the CellBase annotator: a local one when running against a local installation or
     * annotating the variation collection, a web services one otherwise.
     *
     * @return the annotator, or {@code null} if the web services client could not be created
     *         (the error is logged)
     */
    private VariantAnnotator createCellBaseAnnotator() {
        // Assume annotation of CellBase variation collection will always be carried out from a local installation
        if (local || cellBaseAnnotation) {
            // dbAdaptorFactory may have been already initialized at execute if annotating CellBase variation collection
            if (dbAdaptorFactory == null) {
                dbAdaptorFactory = new MongoDBAdaptorFactory(configuration);
            }
            // Normalization should just be performed in one place: before calling the annotation calculator -
            // within the corresponding *AnnotatorTask - since the AnnotatorTasks need the number of sent
            // variants to equal the number of returned annotations
            return new CellBaseLocalVariantAnnotator(
                    new VariantAnnotationCalculator(species, assembly, dbAdaptorFactory), queryOptions);
        }

        try {
            CellBaseClient cellBaseClient = new CellBaseClient(url, configuration.getVersion(), species);
            logger.debug("URL set to: {}", url + ":" + port);

            // TODO: normalization must be carried out in the client - phase set must be sent together with the
            // TODO: variant string to the server for proper phase annotation by REST
            return new CellBaseWSVariantAnnotator(cellBaseClient, queryOptions);
        } catch (URISyntaxException e) {
            logger.error("Invalid CellBase web services URL: " + url, e);
        }
        return null;
    }

    /**
     * Opens (and builds when needed) the RocksDB indexes of the custom VCF files and of the
     * population frequencies file, filling {@code dbIndexes}, {@code dbOptions} and
     * {@code dbLocations}.
     */
    private void getIndexes() {
        dbIndexes = new ArrayList<>();
        dbOptions = new ArrayList<>();
        dbLocations = new ArrayList<>();

        // Index custom files if provided
        if (customFiles != null) {
            for (int i = 0; i < customFiles.size(); i++) {
                String customFileName = customFiles.get(i).toString();
                if (hasVcfExtension(customFileName)) {
                    Object[] dbConnection = getDBConnection(customFileName + ".idx");
                    RocksDB rocksDB = (RocksDB) dbConnection[0];
                    Options dbOption = (Options) dbConnection[1];
                    String dbLocation = (String) dbConnection[2];
                    boolean indexingNeeded = (boolean) dbConnection[3];
                    if (indexingNeeded) {
                        logger.info("Creating index DB at {} ", dbLocation);
                        indexCustomVcfFile(i, rocksDB);
                    } else {
                        logger.info("Index found at {}", dbLocation);
                        logger.info("Skipping index creation");
                    }
                    dbIndexes.add(rocksDB);
                    dbOptions.add(dbOption);
                    dbLocations.add(dbLocation);
                }
            }
        }

        // Index population frequencies file if provided
        if (populationFrequenciesFile != null) {
            // We force the creation of a new index even if there was one already - Annotation of frequencies from
            // these files implies deletions on the RocksDB database. Whatever is already there will probably be wrong
            Object[] dbConnection = getDBConnection(populationFrequenciesFile + ".idx", true);
            RocksDB rocksDB = (RocksDB) dbConnection[0];
            Options dbOption = (Options) dbConnection[1];
            String dbLocation = (String) dbConnection[2];
            logger.info("Creating index DB at {} ", dbLocation);
            indexPopulationFrequencies(rocksDB);
            dbIndexes.add(rocksDB);
            dbOptions.add(dbOption);
            dbLocations.add(dbLocation);
        }
    }

    /**
     * Stores every variant of the population frequencies file in the given RocksDB database, keyed
     * by the variant id. I/O and RocksDB errors are logged, not propagated.
     *
     * @param db database the variants are written to
     */
    private void indexPopulationFrequencies(RocksDB db) {
        ObjectMapper jsonObjectMapper = new ObjectMapper();
        jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
        ObjectWriter jsonObjectWriter = jsonObjectMapper.writer();

        try {
            DataReader<Variant> dataReader = new JsonVariantReader(populationFrequenciesFile.toString());
            dataReader.open();
            dataReader.pre();

            int lineCounter = 0;
            List<Variant> variant = dataReader.read();
            // Stop on an empty batch as well as on null, so that get(0) below can never fail
            while (variant != null && !variant.isEmpty()) {
                Variant current = variant.get(0);
                db.put(VariantAnnotationUtils.buildVariantId(current.getChromosome(), current.getStart(),
                        current.getReference(), current.getAlternate()).getBytes(),
                        jsonObjectWriter.writeValueAsBytes(current.getImpl()));
                lineCounter++;
                if (lineCounter % 100000 == 0) {
                    logger.info("{} lines indexed", lineCounter);
                }
                variant = dataReader.read();
            }

            // NOTE(review): a statement was lost here during extraction (only ";" was left); post() is
            // restored to match the open/pre ... close sequence used above - confirm.
            dataReader.post();
            dataReader.close();
        } catch (IOException | RocksDBException e) {
            logger.error("Unable to index population frequencies file " + populationFrequenciesFile, e);
        }
    }

    /**
     * Opens the RocksDB database at the given location, creating it only if it does not exist.
     *
     * @param dbLocation path of the database
     * @return see {@link #getDBConnection(String, boolean)}
     */
    private Object[] getDBConnection(String dbLocation) {
        return getDBConnection(dbLocation, false);
    }

    /**
     * Opens the RocksDB database at the given location. The database is opened for writing when it
     * has to be (re)built and read-only otherwise. The JVM is terminated if it cannot be opened.
     *
     * @param dbLocation  path of the database
     * @param forceCreate whether the index must be rebuilt even if the location already exists
     * @return array with, in this order: the {@link RocksDB} instance, its {@link Options}, the
     *         location and a boolean telling whether the index has to be built
     */
    private Object[] getDBConnection(String dbLocation, boolean forceCreate) {
        boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation));
        // a static method that loads the RocksDB C++ library.
        RocksDB.loadLibrary();
        // the Options class contains a set of configurable DB options
        // that determines the behavior of a database.
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = null;
        try {
            // a factory method that returns a RocksDB instance
            if (indexingNeeded) {
                db = RocksDB.open(options, dbLocation);
            } else {
                db = RocksDB.openReadOnly(options, dbLocation);
            }
        } catch (RocksDBException e) {
            logger.error("Unable to open RocksDB database at " + dbLocation, e);
            System.exit(1);
        }

        return new Object[]{db, options, dbLocation, indexingNeeded};
    }

    /**
     * Indexes a custom VCF file in the given RocksDB database. Each normalized variant is stored
     * under the key {@code chromosome_start_reference_alternate} with the selected INFO attributes
     * as value. The JVM is terminated if indexing fails.
     *
     * @param customFileNumber position of the file within {@code customFiles}
     * @param db               database the variants are written to
     */
    private void indexCustomVcfFile(int customFileNumber, RocksDB db) {
        ObjectMapper jsonObjectMapper = new ObjectMapper();
        ObjectWriter jsonObjectWriter = jsonObjectMapper.writer();

        // try-with-resources: the reader is released on every path, including failures
        try (VCFFileReader vcfFileReader = new VCFFileReader(customFiles.get(customFileNumber).toFile(), false)) {
            Iterator<VariantContext> iterator = vcfFileReader.iterator();
            VariantContextToVariantConverter converter = new VariantContextToVariantConverter("", "",
                    vcfFileReader.getFileHeader().getSampleNamesInOrder());
            VariantNormalizer normalizer = new VariantNormalizer(true, false, true);
            int lineCounter = 0;
            while (iterator.hasNext()) {
                VariantContext variantContext = iterator.next();
                // Reference positions will not be indexed
                if (variantContext.getAlternateAlleles().size() > 0) {
                    List<Variant> variantList = normalizer
                            .normalize(converter.apply(Collections.singletonList(variantContext)), true);
                    for (Variant variant : variantList) {
                        String key = variant.getChromosome() + "_" + variant.getStart() + "_"
                                + variant.getReference() + "_" + variant.getAlternate();
                        db.put(key.getBytes(),
                                jsonObjectWriter.writeValueAsBytes(parseInfoAttributes(variant, customFileNumber)));
                    }
                }
                lineCounter++;
                if (lineCounter % 100000 == 0) {
                    logger.info("{} lines indexed", lineCounter);
                }
            }
        } catch (IOException | RocksDBException | NonStandardCompliantSampleField e) {
            logger.error("Unable to index custom file " + customFiles.get(customFileNumber), e);
            System.exit(1);
        }
    }

    /**
     * Selects, from the attributes of the first file of the first study of the variant, the ones
     * requested for the given custom file.
     *
     * @param variant          variant read from the custom file
     * @param customFileNumber position of the file within {@code customFiles}
     * @return map with the requested attributes only
     */
    protected Map<String, String> parseInfoAttributes(Variant variant, int customFileNumber) {
        Map<String, String> infoMap = variant.getStudies().get(0).getFiles().get(0).getAttributes();
        List<String> requestedFields = customFileFields.get(customFileNumber);
        Map<String, String> parsedInfo = new HashMap<>();
        for (Map.Entry<String, String> attribute : infoMap.entrySet()) {
            if (requestedFields.contains(attribute.getKey())) {
                parsedInfo.put(attribute.getKey(), attribute.getValue());
            }
        }
        return parsedInfo;
    }

    /**
     * Parses a raw VCF INFO string into one attribute map per allele, keeping only the fields
     * requested for the given custom file.
     *
     * @param info             raw INFO column ({@code key=value} pairs separated by ';')
     * @param numAlleles       number of alleles, i.e. size of the returned list
     * @param customFileNumber position of the file within {@code customFiles}
     * @return list with one attribute map per allele
     * @deprecated kept for backwards compatibility; see {@link #parseInfoAttributes(Variant, int)}
     */
    @Deprecated
    protected List<Map<String, Object>> parseInfoAttributes(String info, int numAlleles, int customFileNumber) {
        List<Map<String, Object>> infoAttributes = new ArrayList<>(numAlleles);
        for (int i = 0; i < numAlleles; i++) {
            infoAttributes.add(new HashMap<>());
        }
        for (String field : info.split(";")) {
            String[] splits = field.split("=");
            if (splits.length == 2 && customFileFields.get(customFileNumber).contains(splits[0])) {
                // Managing values for the allele
                String[] values = splits[1].split(",");
                // numAlleles and values.length may be different. For example, in the Exac vcf AN presents just
                // one value even if there are multiple alleles or, for example, AC_Het provides counts for all
                // possible heterozygous genotypes. In those cases, the whole string is pasted to all alleles
                boolean onePerAllele = values.length == numAlleles;
                for (int i = 0; i < numAlleles; i++) {
                    infoAttributes.get(i).put(splits[0],
                            getValueFromString(onePerAllele ? values[i] : splits[1]));
                }
            }
        }
        return infoAttributes;
    }

    /**
     * Converts a string into an {@code Integer} or, failing that, a {@code Float} when it looks like
     * a number; any other value is returned unchanged.
     *
     * @param value raw attribute value
     * @return the parsed number, or {@code value} itself if it cannot be parsed
     */
    private Object getValueFromString(String value) {
        if (!NumberUtils.isNumber(value)) {
            return value;
        }
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException notAnInteger) {
            try {
                return Float.parseFloat(value);
            } catch (NumberFormatException notAFloat) {
                // NumberUtils.isNumber also accepts forms such as "0x1F" or "10L" that neither
                // Integer.parseInt nor the floating point parsers understand: keep the raw string
                // instead of letting the exception escape
                return value;
            }
        }
    }

    /**
     * Validates the command line options and copies them into the fields used by the run.
     *
     * @throws IOException        if a required file or directory is missing or not accessible
     * @throws ParameterException if a mandatory option is missing or an option has an invalid value
     */
    private void checkParameters() throws IOException {
        final CliOptionsParser.VariantAnnotationCommandOptions options = variantAnnotationCommandOptions;

        // Run benchmark
        benchmark = options.benchmark;
        if (benchmark) {
            referenceFasta = Paths.get(options.referenceFasta);
            FileUtils.checkFile(referenceFasta);
        }

        // Use cache
        queryOptions.put("useCache", !options.noCache);
        queryOptions.put("phased", options.phased);

        // input file
        if (options.input != null) {
            input = Paths.get(options.input);
            if (benchmark) {
                FileUtils.checkDirectory(input);
                normalize = false;
            } else {
                normalize = !options.skipNormalize;
                FileUtils.checkFile(input);
                String fileName = input.toFile().getName();
                if (hasVcfExtension(fileName)) {
                    inputFormat = FileFormat.VCF;
                } else if (fileName.endsWith(".json") || fileName.endsWith(".json.gz")) {
                    inputFormat = FileFormat.JSON;
                } else {
                    throw new ParameterException("Only VCF and JSON formats are currently accepted. Please provide a "
                            + "valid .vcf, .vcf.gz, json or .json.gz file");
                }
            }
        } else {
            // Expected to read from variation collection - normalization must be avoided
            normalize = false;
        }

        // output file
        if (options.output != null) {
            output = Paths.get(options.output);
            try {
                FileUtils.checkDirectory(output.getParent());
            } catch (IOException e) {
                throw new ParameterException(e);
            }
        } else {
            throw new ParameterException("Please check command line sintax. Provide a valid output file name.");
        }

        if (options.outputFormat != null) {
            switch (options.outputFormat.toLowerCase()) {
                case "json":
                    outputFormat = FileFormat.JSON;
                    break;
                case "vep":
                    outputFormat = FileFormat.VEP;
                    break;
                default:
                    throw new ParameterException(
                            "Only JSON and VEP output formats are currently available. Please, select one of them.");
            }
        }

        if (options.include != null && !options.include.isEmpty()) {
            queryOptions.add("include", options.include);
        }
        if (options.exclude != null && !options.exclude.isEmpty()) {
            queryOptions.add("exclude", options.exclude);
        }

        // Num threads: 1 is a valid value and must not trigger the warning
        if (options.numThreads >= 1) {
            numThreads = options.numThreads;
        } else {
            numThreads = 1;
            logger.warn("Incorrect number of numThreads, it must be a positive value. This has been reset to '{}'",
                    numThreads);
        }

        // Batch size
        if (options.batchSize >= 1 && options.batchSize <= 2000) {
            batchSize = options.batchSize;
        } else {
            batchSize = 1;
            logger.warn(
                    "Incorrect size of batch size, it must be a positive value between 1-2000. This has been set to '{}'",
                    batchSize);
        }

        // Direct connection to local MongoDB
        local = options.local;
        if (!options.local) {
            // Url
            if (options.url != null) {
                url = options.url;
            } else {
                throw new ParameterException(
                        "Please check command line sintax. Provide a valid URL to access CellBase web services.");
            }
        }

        // Species
        if (options.species != null) {
            species = options.species;
        } else {
            throw new ParameterException("Please check command line syntax. Provide a valid species name.");
        }

        // Assembly
        if (options.assembly != null) {
            assembly = options.assembly;
        } else {
            assembly = null;
            logger.warn("No assembly provided. Using default assembly for {}", species);
        }

        // Custom files
        if (options.customFiles != null) {
            String[] customFileStrings = options.customFiles.split(",");
            customFiles = new ArrayList<>(customFileStrings.length);
            for (String customFile : customFileStrings) {
                Path customFilePath = Paths.get(customFile);
                FileUtils.checkFile(customFilePath);
                if (!hasVcfExtension(customFilePath.toString())) {
                    throw new ParameterException("Only VCF format is currently accepted for custom annotation files.");
                }
                customFiles.add(customFilePath);
            }

            if (options.customFileIds == null) {
                throw new ParameterException(
                        "Parameter --custom-file-ids missing. Please, provide one short id for each custom file in "
                                + "a comma separated list (no spaces in betwen).");
            }
            customFileIds = Arrays.asList(options.customFileIds.split(","));
            if (customFileIds.size() != customFiles.size()) {
                throw new ParameterException("Different number of custom files and custom file ids. Please, "
                        + "provide one short id for each custom file in a comma separated list (no spaces in between).");
            }

            if (options.customFileFields == null) {
                throw new ParameterException(
                        "Parameter --custom-file-fields missing. Please, provide one list of fields for each "
                                + "custom file in a colon separated list (no spaces in betwen).");
            }
            String[] customFileFieldStrings = options.customFileFields.split(":");
            if (customFileFieldStrings.length != customFiles.size()) {
                throw new ParameterException("Different number of custom files and lists of custom file fields. "
                        + "Please, provide one list of fields for each custom file in a colon separated list (no spaces in between).");
            }
            customFileFields = new ArrayList<>(customFileStrings.length);
            for (String fieldString : customFileFieldStrings) {
                customFileFields.add(Arrays.asList(fieldString.split(",")));
            }
        }

        // Semi-private build parameter for us to build the variation collection including population frequencies
        if (options.buildParams.get("population-frequencies") != null) {
            populationFrequenciesFile = Paths.get(options.buildParams.get("population-frequencies"));
            FileUtils.checkFile(populationFrequenciesFile);
            if (!(populationFrequenciesFile.toString().endsWith(".json")
                    || populationFrequenciesFile.toString().endsWith(".json.gz"))) {
                throw new ParameterException(
                        "Population frequencies file must be a .json (.json.gz) file containing" + " Variant objects.");
            }
        }

        // Annotate variation collection in CellBase
        cellBaseAnnotation = options.cellBaseAnnotation;

        // The list of chromosomes will only be used if annotating the variation collection
        if (cellBaseAnnotation) {
            // This will set chromosomeList with the list of chromosomes to annotate
            setChromosomeList();
        }
    }
}