Java tutorial
/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.mongodb.variant;

import com.google.common.collect.BiMap;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import org.apache.commons.lang3.time.StopWatch;
import org.bson.Document;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.VariantStudy;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.mongodb.MongoDBCollection;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.BatchFileOperation;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStoragePipeline;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantSourceDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.mongodb.variant.adaptors.VariantMongoDBAdaptor;
import org.opencb.opencga.storage.mongodb.variant.converters.DocumentToSamplesConverter;
import org.opencb.opencga.storage.mongodb.variant.exceptions.MongoVariantStorageEngineException;
import org.opencb.opencga.storage.mongodb.variant.load.MongoDBVariantWriteResult;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageConverterTask;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageLoader;
import org.opencb.opencga.storage.mongodb.variant.load.stage.MongoDBVariantStageReader;
import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBOperations;
import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBVariantMergeLoader;
import org.opencb.opencga.storage.mongodb.variant.load.variants.MongoDBVariantMerger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;

import static org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options;
import static org.opencb.opencga.storage.mongodb.variant.MongoDBVariantStorageEngine.MongoDBVariantOptions.*;

/**
 * Created on 30/03/16.
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class MongoDBVariantStoragePipeline extends VariantStoragePipeline {

    private final VariantMongoDBAdaptor dbAdaptor;
    private final ObjectMap loadStats = new ObjectMap();

    public MongoDBVariantStoragePipeline(StorageConfiguration configuration, String storageEngineId,
                                         VariantMongoDBAdaptor dbAdaptor) {
        super(configuration, storageEngineId, LoggerFactory.getLogger(MongoDBVariantStoragePipeline.class),
                dbAdaptor, new VariantReaderUtils());
        this.dbAdaptor = dbAdaptor;
    }

    public URI preLoad(URI input, URI output) throws StorageEngineException {
        URI uri = super.preLoad(input, output);
        if (isResumeStage(options)) {
            logger.info("Resume stage load.");
            // Clean stage collection?
        }
        return uri;
    }

    @Override
    protected void securePreLoad(StudyConfiguration studyConfiguration, VariantSource source) throws StorageEngineException {
        super.securePreLoad(studyConfiguration, source);
        int fileId = options.getInt(Options.FILE_ID.key());

        if (studyConfiguration.getAttributes().containsKey(DEFAULT_GENOTYPE.key())) {
            Set<String> defaultGenotype = new HashSet<>(studyConfiguration.getAttributes().getAsStringList(DEFAULT_GENOTYPE.key()));
            logger.debug("Using default genotype from study configuration: {}", defaultGenotype);
        } else {
            Set<String> defaultGenotype;
            if (options.containsKey(DEFAULT_GENOTYPE.key())) {
                defaultGenotype = new HashSet<>(options.getAsStringList(DEFAULT_GENOTYPE.key()));
            } else {
                VariantStudy.StudyType studyType = options.get(Options.STUDY_TYPE.key(), VariantStudy.StudyType.class,
                        Options.STUDY_TYPE.defaultValue());
                switch (studyType) {
                    case FAMILY:
                    case TRIO:
                    case PAIRED:
                    case PAIRED_TUMOR:
                        defaultGenotype = Collections.singleton(DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
                        logger.debug("Do not compress genotypes. Default genotype : {}", defaultGenotype);
                        break;
                    default:
                        defaultGenotype = new HashSet<>(DEFAULT_GENOTYPE.defaultValue());
                        logger.debug("No default genotype found. Using default genotype: {}", defaultGenotype);
                        break;
                }
            }
            studyConfiguration.getAttributes().put(DEFAULT_GENOTYPE.key(), defaultGenotype);
        }

        boolean newSampleBatch = checkCanLoadSampleBatch(studyConfiguration, fileId);

        if (newSampleBatch) {
            logger.info("New sample batch!!!");
            //TODO: Check if there are regions with gaps
//            ArrayList<Integer> indexedFiles = new ArrayList<>(studyConfiguration.getIndexedFiles());
//            if (!indexedFiles.isEmpty()) {
//                LinkedHashSet<Integer> sampleIds = studyConfiguration.getSamplesInFiles().get(indexedFiles.get(indexedFiles.size() - 1));
//                if (!sampleIds.isEmpty()) {
//                    Integer sampleId = sampleIds.iterator().next();
//                    String files = "";
//                    for (Integer indexedFileId : indexedFiles) {
//                        if (studyConfiguration.getSamplesInFiles().get(indexedFileId).contains(sampleId)) {
//                            files += "!" + indexedFileId + ";";
//                        }
//                    }
////                    String genotypes = sampleIds.stream().map(i -> studyConfiguration.getSampleIds().inverse().get(i) + ":" +
//                            DBObjectToSamplesConverter.UNKNOWN_GENOTYPE).collect(Collectors.joining(","));
//                    String genotypes = sampleId + ":" + DBObjectToSamplesConverter.UNKNOWN_GENOTYPE;
//                    Long v = getDBAdaptor(null).count(new Query()
//                            .append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId())
//                            .append(VariantDBAdaptor.VariantQueryParams.FILES.key(), files)
//                            .append(VariantDBAdaptor.VariantQueryParams.GENOTYPE.key(), genotypes)).first();
//                }
//            }
        }

        boolean doMerge = options.getBoolean(MERGE.key(), false);
        boolean doStage = options.getBoolean(STAGE.key(), false);

        if (!doMerge && !doStage) {
            doMerge = true;
            doStage = true;
        }
        options.put(MERGE.key(), doMerge);
        options.put(STAGE.key(), doStage);

        securePreStage(fileId, studyConfiguration);
//        QueryResult<Long> countResult = dbAdaptor.count(new Query(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration
//                .getStudyId())
//                .append(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileId));
//        Long count = countResult.first();
//        if (count != 0) {
//            logger.warn("Resume mode. There are already loaded variants from the file "
//                    + studyConfiguration.getFileIds().inverse().get(fileId) + " : " + fileId + " ");
//            options.put(ALREADY_LOADED_VARIANTS.key(), count);
//        }
    }

    @Override
    public URI load(URI inputUri) throws IOException, StorageEngineException {

//        boolean includeSamples = options.getBoolean(Options.INCLUDE_GENOTYPES.key(), Options.INCLUDE_GENOTYPES.defaultValue());
//        boolean includeStats = options.getBoolean(Options.INCLUDE_STATS.key(), Options.INCLUDE_STATS.defaultValue());
//        boolean includeSrc = options.getBoolean(Options.INCLUDE_SRC.key(), Options.INCLUDE_SRC.defaultValue());
//        boolean compressGenotypes = options.getBoolean(Options.COMPRESS_GENOTYPES.key(), false);
//        boolean compressGenotypes = defaultGenotype != null && !defaultGenotype.isEmpty();

        boolean doMerge = options.getBoolean(MERGE.key(), false);
        boolean doStage = options.getBoolean(STAGE.key(), false);
        final int fileId = options.getInt(Options.FILE_ID.key());

        logger.info("Loading variants...");
        long start = System.currentTimeMillis();

        if (doStage) {
            stage(inputUri);
        }

        long skippedVariants = options.getLong("skippedVariants");
        if (doMerge) {
            MongoDBVariantWriteResult writeResult = merge(Collections.singletonList(fileId), skippedVariants);
        }
        long end = System.currentTimeMillis();
        logger.info("end - start = " + (end - start) / 1000.0 + "s");
        logger.info("Variants loaded!");

        return inputUri; //TODO: Return something like this: mongo://<host>/<dbName>/<collectionName>
    }

    public void stage(URI inputUri) throws StorageEngineException {
        final int fileId = options.getInt(Options.FILE_ID.key());

        if (!options.getBoolean(STAGE.key(), false)) {
            // Do not stage!
            return;
        }

        Path input = Paths.get(inputUri.getPath());

        VariantSource source = readVariantSource(inputUri, null);
        int numRecords = source.getStats().getNumRecords();
        int batchSize = options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue());
        int bulkSize = options.getInt(BULK_SIZE.key(), batchSize);
        int loadThreads = options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue());
        final int numReaders = 1;
//        final int numTasks = loadThreads == 1 ? 1 : loadThreads - numReaders; //Subtract the reader thread
        MongoDBCollection stageCollection = dbAdaptor.getStageCollection();

        try {
            StudyConfiguration studyConfiguration = getStudyConfiguration();

            //Reader
            VariantReader variantReader;
            variantReader = VariantReaderUtils.getVariantReader(input, source);

            //Remapping ids task
            String fileIdStr = options.getString(Options.FILE_ID.key());
            ParallelTaskRunner.Task<Variant, Variant> remapIdsTask = batch -> {
                batch.forEach(variant -> variant.getStudies().forEach(studyEntry -> {
                    studyEntry.setStudyId(Integer.toString(studyConfiguration.getStudyId()));
                    studyEntry.getFiles().forEach(fileEntry -> fileEntry.setFileId(fileIdStr));
                }));
                return batch;
            };

            //Runner
            ProgressLogger progressLogger = new ProgressLogger("Write variants in STAGE collection:", numRecords, 200);
            MongoDBVariantStageConverterTask converterTask = new MongoDBVariantStageConverterTask(progressLogger);
            MongoDBVariantStageLoader stageLoader =
                    new MongoDBVariantStageLoader(stageCollection, studyConfiguration.getStudyId(), fileId, isResumeStage(options));

            ParallelTaskRunner<Variant, ?> ptr;
            ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder().setReadQueuePutTimeout(20 * 60)
                    .setNumTasks(loadThreads).setBatchSize(batchSize).setAbortOnFail(true).build();
            if (options.getBoolean(STAGE_PARALLEL_WRITE.key(), STAGE_PARALLEL_WRITE.defaultValue())) {
                logger.info("Multi thread stage load... [{} readerThreads, {} writerThreads]", numReaders, loadThreads);
                ptr = new ParallelTaskRunner<>(variantReader, remapIdsTask.then(converterTask).then(stageLoader), null, config);
            } else {
                logger.info("Multi thread stage load... [{} readerThreads, {} tasks, {} writerThreads]", numReaders, loadThreads, 1);
                ptr = new ParallelTaskRunner<>(variantReader, remapIdsTask.then(converterTask), stageLoader, config);
            }

            Thread hook = new Thread(() -> {
                try {
                    logger.error("Stage shutdown hook!");
                    stageError();
                } catch (StorageEngineException e) {
                    logger.error("Error at shutdown", e);
                    throw new RuntimeException(e);
                }
            });
            try {
                Runtime.getRuntime().addShutdownHook(hook);
                ptr.run();
                stageSuccess(source);
            } finally {
                Runtime.getRuntime().removeShutdownHook(hook);
            }

            long skippedVariants = converterTask.getSkippedVariants();
            stageLoader.getWriteResult().setSkippedVariants(skippedVariants);
            loadStats.append(MERGE.key(), false);
            loadStats.append("stageWriteResult", stageLoader.getWriteResult());
            options.put("skippedVariants", skippedVariants);
            logger.info("Stage Write result: {}", skippedVariants);
        } catch (ExecutionException | RuntimeException e) {
            try {
                stageError();
            } catch (Exception e2) {
                // Do not propagate this exception!
                logger.error("Error reporting stage error!", e2);
            }
            throw new StorageEngineException("Error while executing STAGE variants", e);
        }
    }

    /**
     * Check whether this file can be staged.
     *
     * - The file is not staged
     * - The file is not being staged
     *
     */
    private BatchFileOperation preStage(int fileId) throws StorageEngineException {

        StudyConfigurationManager scm = dbAdaptor.getStudyConfigurationManager();

        AtomicReference<BatchFileOperation> operation = new AtomicReference<>();
        scm.lockAndUpdate(getStudyId(), studyConfiguration -> {
            operation.set(securePreStage(fileId, studyConfiguration));
            return studyConfiguration;
        });

        return operation.get();
    }

    private BatchFileOperation securePreStage(int fileId, StudyConfiguration studyConfiguration) throws StorageEngineException {
        String fileName = studyConfiguration.getFileIds().inverse().get(fileId);

        Query query = new Query()
                .append(VariantSourceDBAdaptor.VariantSourceQueryParam.STUDY_ID.key(), studyConfiguration.getStudyId())
                .append(VariantSourceDBAdaptor.VariantSourceQueryParam.FILE_ID.key(), fileId);
        Iterator<VariantSource> iterator = dbAdaptor.getVariantSourceDBAdaptor().iterator(query, new QueryOptions());

        boolean loadStageResume = false;
        boolean stage = true;

        BatchFileOperation operation = getBatchFileOperation(studyConfiguration.getBatches(),
                op -> op.getOperationName().equals(STAGE.key()) && op.getFileIds().equals(Collections.singletonList(fileId)));

        if (iterator.hasNext()) {
            // Already indexed!
            logger.info("File \"{}\" ({}) already staged!", fileName, fileId);
            stage = false;

            if (operation != null && !operation.currentStatus().equals(BatchFileOperation.Status.READY)) {
                // There was an error writing the operation status. Restore to "READY"
                operation.addStatus(BatchFileOperation.Status.READY);
            }
        } else {
            loadStageResume = isResumeStage(options);

            if (operation != null) {
                switch (operation.currentStatus()) {
                    case READY:
                        // Already indexed!
                        // TODO: Believe this ready? What if deleted?
                        logger.info("File \"{}\" ({}) already staged!", fileName, fileId);
                        stage = false;
                        //dbAdaptor.getVariantSourceDBAdaptor().updateVariantSource(source);
                        break;
                    case RUNNING:
                        if (!loadStageResume) {
                            throw MongoVariantStorageEngineException.fileBeingStagedException(fileId, fileName);
                        }
                        // Fall through: resume the stage load
                    case ERROR:
                        // Resume stage
                        loadStageResume = true;
                        options.put(STAGE_RESUME.key(), true);
                        break;
                    default:
                        throw new IllegalStateException("Unknown status: " + operation.currentStatus());
                }
            } else {
                operation = new BatchFileOperation(STAGE.key(), Collections.singletonList(fileId), System.currentTimeMillis(),
                        BatchFileOperation.Type.OTHER);
                studyConfiguration.getBatches().add(operation);
            }
            if (stage) {
                operation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
            }
        }

        if (stage) {
            BatchFileOperation mergeOperation = getBatchFileOperation(studyConfiguration.getBatches(),
                    op -> op.getOperationName().equals(MERGE.key()) && !op.currentStatus().equals(BatchFileOperation.Status.READY));
            if (mergeOperation != null) {
                // Avoid stage new files if there are ongoing merge operations
                throw MongoVariantStorageEngineException.operationInProgressException(mergeOperation);
            }
        }

        options.put(STAGE.key(), stage);
        return operation;
    }

    private BatchFileOperation getBatchFileOperation(List<BatchFileOperation> batches, Predicate<BatchFileOperation> filter) {
        for (int i = batches.size() - 1; i >= 0; i--) {
            BatchFileOperation op = batches.get(i);
            if (filter.test(op)) {
                return op;
            }
        }
        return null;
    }

    public void stageError() throws StorageEngineException {
        int fileId = options.getInt(Options.FILE_ID.key());
        setStatus(BatchFileOperation.Status.ERROR, STAGE.key(), Collections.singletonList(fileId));
    }

    public void stageSuccess(VariantSource source) throws StorageEngineException {
        // Stage loading finished. Save VariantSource and update BatchOperation
        source.setFileId(options.getString(Options.FILE_ID.key()));
        source.setStudyId(options.getString(Options.STUDY_ID.key()));

        setStatus(BatchFileOperation.Status.READY, STAGE.key(), Collections.singletonList(options.getInt(Options.FILE_ID.key())));
        dbAdaptor.getVariantSourceDBAdaptor().updateVariantSource(source);
    }

    /**
     * Merge staged files into Variant collection.
     *
     * @param fileIds FileIDs of the files to be merged
     * @return Write Result with times and count
     * @throws StorageEngineException If there is a problem executing the {@link ParallelTaskRunner}
     */
    public MongoDBVariantWriteResult merge(List<Integer> fileIds) throws StorageEngineException {
        return merge(fileIds, options.getInt("skippedVariants", 0));
    }

    /**
     * Merge staged files into Variant collection.
     *
     * 1- Find if the files are in different chromosomes.
     * 2- If split by chromosome, call once per chromosome. Else, call only once.
     *
     * @see MongoDBVariantMerger
     *
     * @param fileIds FileIDs of the files to be merged
     * @param skippedVariants Number of variants skipped in the stage step
     * @return Write Result with times and count
     * @throws StorageEngineException If there is a problem executing the {@link ParallelTaskRunner}
     */
    protected MongoDBVariantWriteResult merge(List<Integer> fileIds, long skippedVariants) throws StorageEngineException {

        long start = System.currentTimeMillis();
        options.put(Options.FILE_ID.key(), fileIds);

        StudyConfiguration studyConfiguration = preMerge(fileIds);

        //Stage collection where files are loaded.
        MongoDBCollection stageCollection = dbAdaptor.getStageCollection();

        int batchSize = options.getInt(Options.LOAD_BATCH_SIZE.key(), Options.LOAD_BATCH_SIZE.defaultValue());
        int loadThreads = options.getInt(Options.LOAD_THREADS.key(), Options.LOAD_THREADS.defaultValue());
        int capacity = options.getInt("blockingQueueCapacity", loadThreads * 2);

        //Iterate over all the files
        Query query = new Query(VariantSourceDBAdaptor.VariantSourceQueryParam.STUDY_ID.key(), studyConfiguration.getStudyId());
        Iterator<VariantSource> iterator = dbAdaptor.getVariantSourceDBAdaptor().iterator(query, null);

        // List of chromosomes to be loaded
        Set<String> chromosomesToLoad = new HashSet<>();
        // List of all the indexed files that cover each chromosome
        ListMultimap<String, Integer> chromosomeInLoadedFiles = LinkedListMultimap.create();
        // List of all the files to be loaded that cover each chromosome
        ListMultimap<String, Integer> chromosomeInFilesToLoad = LinkedListMultimap.create();

        Set<String> wholeGenomeFiles = new HashSet<>();
        Set<String> byChromosomeFiles = new HashSet<>();

        while (iterator.hasNext()) {
            VariantSource variantSource = iterator.next();
            int fileId = Integer.parseInt(variantSource.getFileId());

            // If the file is going to be loaded, check if it covers just one chromosome
            if (fileIds.contains(fileId)) {
                if (variantSource.getStats().getChromosomeCounts().size() == 1) {
                    chromosomesToLoad.addAll(variantSource.getStats().getChromosomeCounts().keySet());
                    byChromosomeFiles.add(variantSource.getFileName());
                } else {
                    wholeGenomeFiles.add(variantSource.getFileName());
                }
            }
            // If the file is indexed, add to the map of chromosome->fileId
            for (String chromosome : variantSource.getStats().getChromosomeCounts().keySet()) {
                if (studyConfiguration.getIndexedFiles().contains(fileId)) {
                    chromosomeInLoadedFiles.put(chromosome, fileId);
                } else if (fileIds.contains(fileId)) {
                    chromosomeInFilesToLoad.put(chromosome, fileId);
                }
                // else { ignore files that are not loaded, and are not going to be loaded }
            }
        }

        final MongoDBVariantWriteResult writeResult;
        if (options.getBoolean(MERGE_SKIP.key())) {
            // It was already merged, but still some work is needed. Exit to do postLoad step
            writeResult = new MongoDBVariantWriteResult();
        } else {
            Thread hook = new Thread(() -> {
                try {
                    logger.error("Merge shutdown hook!");
                    setStatus(BatchFileOperation.Status.ERROR, MERGE.key(), fileIds);
                } catch (StorageEngineException e) {
                    logger.error("Failed setting status '" + MERGE.key() + "' operation over files " + fileIds
                            + " to '" + BatchFileOperation.Status.ERROR + '\'', e);
                    throw new RuntimeException(e);
                }
            });
            Runtime.getRuntime().addShutdownHook(hook);
            try {
                if (!wholeGenomeFiles.isEmpty() && !byChromosomeFiles.isEmpty()) {
                    String message = "Impossible to merge files split and not split by chromosome at the same time! "
                            + "Files covering only one chromosome: " + byChromosomeFiles + ". "
                            + "Files covering more than one chromosome: " + wholeGenomeFiles;
                    logger.error(message);
                    throw new StorageEngineException(message);
                }

                if (chromosomesToLoad.isEmpty()) {
                    writeResult = mergeByChromosome(fileIds, batchSize, loadThreads, stageCollection,
                            studyConfiguration, null, studyConfiguration.getIndexedFiles());
                } else {
                    writeResult = new MongoDBVariantWriteResult();
                    for (String chromosome : chromosomesToLoad) {
                        List<Integer> filesToLoad = chromosomeInFilesToLoad.get(chromosome);
                        Set<Integer> indexedFiles = new HashSet<>(chromosomeInLoadedFiles.get(chromosome));
                        MongoDBVariantWriteResult aux = mergeByChromosome(filesToLoad, batchSize, loadThreads, stageCollection,
                                studyConfiguration, chromosome, indexedFiles);
                        writeResult.merge(aux);
                    }
                }
            } catch (Exception e) {
                setStatus(BatchFileOperation.Status.ERROR, MERGE.key(), fileIds);
                throw e;
            } finally {
                Runtime.getRuntime().removeShutdownHook(hook);
            }
            setStatus(BatchFileOperation.Status.DONE, MERGE.key(), fileIds);
        }

        if (!options.getBoolean(STAGE_CLEAN_WHILE_LOAD.key(), STAGE_CLEAN_WHILE_LOAD.defaultValue())) {
            StopWatch time = StopWatch.createStarted();
            logger.info("Deleting variant records from Stage collection");
            long modifiedCount = MongoDBVariantStageLoader.cleanStageCollection(stageCollection, studyConfiguration.getStudyId(),
                    fileIds, chromosomesToLoad, writeResult);
            logger.info("Delete variants time: " + time.getTime(TimeUnit.MILLISECONDS) / 1000.0 + "s , CleanDocuments: " + modifiedCount);
        }

        writeResult.setSkippedVariants(skippedVariants);

        logger.info("Write result: {}", writeResult.toString());
//        logger.info("Write result: {}", writeResult.toTSV());
        logger.info("Write result: {}", writeResult.toJson());
        options.put("writeResult", writeResult);
        loadStats.append(MERGE.key(), true);
        loadStats.append("mergeWriteResult", writeResult);

        long end = System.currentTimeMillis();
        logger.info("end - start = " + (end - start) / 1000.0 + "s");
        logger.info("Variants merged!");
        return writeResult;
    }

    private StudyConfiguration preMerge(List<Integer> fileIds) throws StorageEngineException {
        int studyId = getStudyId();
        Set<Integer> fileIdsSet = new HashSet<>(fileIds);
        return dbAdaptor.getStudyConfigurationManager().lockAndUpdate(studyId, studyConfiguration -> {
            for (Integer fileId : fileIds) {
                if (studyConfiguration.getIndexedFiles().contains(fileId)) {
                    throw StorageEngineException.alreadyLoaded(fileId, studyConfiguration);
                }
            }

            boolean loadMergeResume = isResumeMerge(options);

            List<BatchFileOperation> batches = studyConfiguration.getBatches();
            BatchFileOperation operation = null;
            for (int i = batches.size() - 1; i >= 0; i--) {
                BatchFileOperation op = batches.get(i);
                if (op.getOperationName().equals(MERGE.key())
                        && fileIds.size() == op.getFileIds().size()
                        && fileIdsSet.containsAll(op.getFileIds())) {
                    switch (op.currentStatus()) {
                        case READY:
                            // Already indexed!
                            // TODO: Believe this ready? What if deleted?
                            // It was not "indexed" so suppose "deleted" ?
                            break;
                        case DONE:
                            // Already merged but still needs some work.
                            logger.info("Files " + fileIds + " were already merged, but were not marked as indexed files.");
                            options.put(MERGE_SKIP.key(), true);
                            // Fall through
                        case RUNNING:
                            if (!loadMergeResume) {
                                throw MongoVariantStorageEngineException.filesBeingMergedException(fileIds);
                            }
                            break;
                        case ERROR:
                            // Resume merge
                            loadMergeResume = true;
                            options.put(MERGE_RESUME.key(), loadMergeResume);
                            break;
                        default:
                            throw new IllegalStateException("Unknown status: " + op.currentStatus());
                    }
                    operation = op;
                    break;
                } else {
                    // Can not merge any file if there is an ongoing MERGE or STAGE operation
                    if (op.getOperationName().equals(MERGE.key()) || op.getOperationName().equals(STAGE.key())) {
                        if (!op.currentStatus().equals(BatchFileOperation.Status.READY)) {
                            throw MongoVariantStorageEngineException.operationInProgressException(op);
                        }
                    }
                }
            }

            if (operation == null) {
                operation = new BatchFileOperation(MERGE.key(), fileIds, System.currentTimeMillis(), BatchFileOperation.Type.LOAD);
                studyConfiguration.getBatches().add(operation);
                operation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
            } else if (operation.currentStatus() == BatchFileOperation.Status.ERROR) {
                // Only set to RUNNING if it was on ERROR
                operation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
            }
            return studyConfiguration;
        });
    }

    private MongoDBVariantWriteResult mergeByChromosome(List<Integer> fileIds, int batchSize, int loadThreads,
                                                        MongoDBCollection stageCollection, StudyConfiguration studyConfiguration,
                                                        String chromosomeToLoad, Set<Integer> indexedFiles)
            throws StorageEngineException {

        MongoDBVariantStageReader reader = new MongoDBVariantStageReader(stageCollection, studyConfiguration.getStudyId(),
                chromosomeToLoad == null ? Collections.emptyList() : Collections.singletonList(chromosomeToLoad));
        boolean resume = isResumeMerge(options);
        boolean cleanWhileLoading = options.getBoolean(STAGE_CLEAN_WHILE_LOAD.key(), STAGE_CLEAN_WHILE_LOAD.defaultValue());
        ProgressLogger progressLogger = new ProgressLogger("Write variants in VARIANTS collection:", reader::countNumVariants, 200);
        progressLogger.setApproximateTotalCount(reader.countAproxNumVariants());

        boolean ignoreOverlapping = options.getBoolean(MERGE_IGNORE_OVERLAPPING_VARIANTS.key(),
                MERGE_IGNORE_OVERLAPPING_VARIANTS.defaultValue());
        MongoDBVariantMerger variantMerger = new MongoDBVariantMerger(dbAdaptor, studyConfiguration, fileIds, indexedFiles, resume,
                ignoreOverlapping);
        MongoDBVariantMergeLoader variantLoader = new MongoDBVariantMergeLoader(dbAdaptor.getVariantsCollection(), stageCollection,
                studyConfiguration.getStudyId(), fileIds, resume, cleanWhileLoading, progressLogger);

        ParallelTaskRunner<Document, MongoDBOperations> ptrMerge;
        ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder().setReadQueuePutTimeout(20 * 60)
                .setNumTasks(loadThreads).setBatchSize(batchSize).setAbortOnFail(true).build();
        try {
            if (options.getBoolean(MERGE_PARALLEL_WRITE.key(), MERGE_PARALLEL_WRITE.defaultValue())) {
                ptrMerge = new ParallelTaskRunner<>(reader, variantMerger.then(variantLoader), null, config);
            } else {
                ptrMerge = new ParallelTaskRunner<>(reader, variantMerger, variantLoader, config);
            }
        } catch (RuntimeException e) {
            throw new StorageEngineException("Error while creating ParallelTaskRunner", e);
        }

        try {
            if (chromosomeToLoad != null) {
                logger.info("Merging files {} in chromosome: {}. Other indexed files in chromosome {}: {}",
                        fileIds, chromosomeToLoad, chromosomeToLoad, indexedFiles);
            } else {
                logger.info("Merging files " + fileIds);
            }
            ptrMerge.run();
        } catch (ExecutionException e) {
            logger.info("Write result: {}", variantLoader.getResult());
            throw new StorageEngineException("Error while executing LoadVariants in ParallelTaskRunner", e);
        }
        return variantLoader.getResult();
    }

    @Override
    public URI postLoad(URI input, URI output) throws StorageEngineException {
        if (options.getBoolean(MERGE.key())) {
            return super.postLoad(input, output);
        } else {
            return input;
        }
    }

    @Override
    public void securePostLoad(List<Integer> fileIds, StudyConfiguration studyConfiguration) throws StorageEngineException {
        super.securePostLoad(fileIds, studyConfiguration);
        BatchFileOperation.Status status = secureSetStatus(studyConfiguration, BatchFileOperation.Status.READY, MERGE.key(), fileIds);
        if (status != BatchFileOperation.Status.DONE) {
            logger.warn("Unexpected status " + status);
        }
    }

    @Override
    public ObjectMap getLoadStats() {
        return loadStats;
    }

    @Override
    protected void checkLoadedVariants(URI input, List<Integer> fileIds, StudyConfiguration studyConfiguration)
            throws StorageEngineException {
        if (fileIds.size() == 1) {
            checkLoadedVariants(input, fileIds.get(0), studyConfiguration);
        } else {
            // FIXME: Check variants in this situation!
            logger.warn("Skip check loaded variants");
        }
    }

    @Override
    protected void checkLoadedVariants(URI input, int fileId, StudyConfiguration studyConfiguration) throws StorageEngineException {
        VariantSource variantSource = VariantReaderUtils.readVariantSource(Paths.get(input.getPath()), null);

//        VariantMongoDBAdaptor dbAdaptor = getDBAdaptor(options.getString(VariantStorageEngine.Options.DB_NAME.key()));
        Long count = dbAdaptor
                .count(new Query().append(VariantDBAdaptor.VariantQueryParams.FILES.key(), fileId)
                        .append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId()))
                .first();
        Long overlappedCount = dbAdaptor
                .count(new Query().append(VariantDBAdaptor.VariantQueryParams.FILES.key(), -fileId)
                        .append(VariantDBAdaptor.VariantQueryParams.STUDIES.key(), studyConfiguration.getStudyId()))
                .first();

        long variantsToLoad = 0;
        long expectedSkippedVariants = 0;
        int symbolicVariants = 0;
        int nonVariants = 0;
        long alreadyLoadedVariants = options.getLong(ALREADY_LOADED_VARIANTS.key(), 0L);

        for (Map.Entry<String, Integer> entry : variantSource.getStats().getVariantTypeCounts().entrySet()) {
            if (entry.getKey().equals(VariantType.SYMBOLIC.toString())) {
                expectedSkippedVariants += entry.getValue();
                symbolicVariants = entry.getValue();
            } else if (entry.getKey().equals(VariantType.NO_VARIATION.toString())) {
                expectedSkippedVariants += entry.getValue();
                nonVariants = entry.getValue();
            } else {
                variantsToLoad += entry.getValue();
            }
        }
        MongoDBVariantWriteResult writeResult = options.get("writeResult", MongoDBVariantWriteResult.class);
        long expectedCount = variantsToLoad;
        if (alreadyLoadedVariants != 0) {
            writeResult.setNonInsertedVariants(writeResult.getNonInsertedVariants() - alreadyLoadedVariants);
        }
        if (writeResult.getNonInsertedVariants() != 0) {
            expectedCount -= writeResult.getNonInsertedVariants();
        }
        if (writeResult.getOverlappedVariants() != 0) {
            // Expect to find this file in all the overlapped variants
            expectedCount += writeResult.getOverlappedVariants();
        }

        logger.info("============================================================");
        logger.info("Check loaded file '" + variantSource.getFileName() + "' (" + fileId + ')');
        if (expectedSkippedVariants != writeResult.getSkippedVariants()) {
            logger.error("Wrong number of skipped variants. Expected " + expectedSkippedVariants + " and got "
                    + writeResult.getSkippedVariants());
        } else if (writeResult.getSkippedVariants() > 0) {
            logger.warn("There were " + writeResult.getSkippedVariants() + " skipped variants.");
            if (symbolicVariants > 0) {
                logger.info(" * Of which " + symbolicVariants + " are " + VariantType.SYMBOLIC.toString() + " variants.");
            }
            if (nonVariants > 0) {
                logger.info(" * Of which " + nonVariants + " are " + VariantType.NO_VARIATION.toString() + " variants.");
            }
        }

        if (writeResult.getNonInsertedVariants() != 0) {
            logger.error("There were " + writeResult.getNonInsertedVariants() + " duplicated variants not inserted. ");
        }

        if (alreadyLoadedVariants != 0) {
            logger.info("Resume mode. Previously loaded variants: " + alreadyLoadedVariants);
        }

        StorageEngineException exception = null;
        if (expectedCount != (count + overlappedCount)) {
            String message = "Wrong number of loaded variants. Expected: " + expectedCount + " and got: " + (count + overlappedCount)
                    + " (" + count + " from file, " + overlappedCount + " overlapped)";
            logger.error(message);
            logger.error(" * Variants to load : " + variantsToLoad);
            logger.error(" * Non Inserted (due to duplications) : " + writeResult.getNonInsertedVariants());
            logger.error(" * Overlapped variants (extra insertions) : " + writeResult.getOverlappedVariants());
//            exception = new StorageEngineException(message);
        } else {
            logger.info("Final number of loaded variants: " + count
                    + (overlappedCount > 0 ? " + " + overlappedCount + " overlapped variants" : ""));
        }
        logger.info("============================================================");
        if (exception != null) {
            throw exception;
        }
    }


    /* --------------------------------------- */
    /*  StudyConfiguration utils methods       */
    /* --------------------------------------- */

    /**
     * Check if the samples from the selected file can be loaded.
     * <p>
     * MongoDB storage plugin is not able to load batches of samples in an unordered way.
     * A batch of samples is a group of samples of any size. It may be composed of one or several VCF files, depending
     * on whether it is split by region (horizontally) or not.
     * All the files from the same batch must be loaded before loading the next batch. If a new batch of
     * samples begins to be loaded, it won't be possible to load other files from previous batches.
     * <p>
     * The StudyConfiguration must be complete, with all the indexed files and the samples in each file.
     * The provided StudyConfiguration won't be modified.
     * Requirements:
     * - The samples in the file must be either all loaded or all not loaded.
     * - If all samples are already loaded, they must match (same samples, same order) the last loaded file.
     *
     * @param studyConfiguration StudyConfiguration from the selected study
     * @param fileId File to load
     * @return Whether this file represents a new batch of samples
     * @throws StorageEngineException If any requirement is not met
     */
    public static boolean checkCanLoadSampleBatch(final StudyConfiguration studyConfiguration, int fileId) throws StorageEngineException {
        LinkedHashSet<Integer> sampleIds = studyConfiguration.getSamplesInFiles().get(fileId);
        if (!sampleIds.isEmpty()) {
            boolean allSamplesRepeated = true;
            boolean someSamplesRepeated = false;

            BiMap<String, Integer> indexedSamples = StudyConfiguration.getIndexedSamples(studyConfiguration);
            for (Integer sampleId : sampleIds) {
                if (!indexedSamples.containsValue(sampleId)) {
                    allSamplesRepeated = false;
                } else {
                    someSamplesRepeated = true;
                }
            }

            if (allSamplesRepeated) {
                ArrayList<Integer> indexedFiles = new ArrayList<>(studyConfiguration.getIndexedFiles());
                if (!indexedFiles.isEmpty()) {
                    int lastIndexedFile = indexedFiles.get(indexedFiles.size() - 1);
                    // Check that they are the same samples in the same order
                    if (!new ArrayList<>(studyConfiguration.getSamplesInFiles().get(lastIndexedFile))
                            .equals(new ArrayList<>(sampleIds))) {
                        //ERROR
                        if (studyConfiguration.getSamplesInFiles().get(lastIndexedFile).containsAll(sampleIds)) {
                            throw new StorageEngineException("Unable to load this batch. Wrong samples order"); //TODO: Should it care?
                        } else {
                            throw new StorageEngineException("Unable to load this batch. Another sample batch has been loaded already.");
                        }
                    }
                    //Ok, the batch of samples matches with the last loaded batch of samples.
                    return false; // This is NOT a new batch of samples
                }
            } else if (someSamplesRepeated) {
                throw new StorageEngineException("There were some already indexed samples, but not all of them. "
                        + "Unable to load in Storage-MongoDB");
            }
        }
        return true; // This is a new batch of samples
    }

//    @Override
//    public void checkStudyConfiguration(StudyConfiguration studyConfiguration, VariantDBAdaptor dbAdaptor) throws
//            StorageEngineException {
//        super.checkStudyConfiguration(studyConfiguration, dbAdaptor);
//        if (dbAdaptor == null) {
//            logger.debug("Do not check StudyConfiguration against the loaded in MongoDB");
//        } else {
//            if (dbAdaptor instanceof VariantMongoDBAdaptor) {
//                VariantMongoDBAdaptor mongoDBAdaptor = (VariantMongoDBAdaptor) dbAdaptor;
//                StudyConfigurationManager studyConfigurationDBAdaptor = mongoDBAdaptor.getStudyConfigurationManager();
//                StudyConfiguration studyConfigurationFromMongo = studyConfigurationDBAdaptor.getStudyConfiguration(studyConfiguration
//                        .getStudyId(), null).first();
//
//                //Check that the provided StudyConfiguration has the same or more information that the stored in MongoDB.
//                for (Map.Entry<String, Integer> entry : studyConfigurationFromMongo.getFileIds().entrySet()) {
//                    if (!studyConfiguration.getFileIds().containsKey(entry.getKey())) {
//                        throw new StorageEngineException("StudyConfiguration do not have the file " + entry.getKey());
//                    }
//                    if (!studyConfiguration.getFileIds().get(entry.getKey()).equals(entry.getValue())) {
//                        throw new StorageEngineException("StudyConfiguration changes the fileId of '" + entry.getKey() + "' from " +
//                                entry.getValue() + " to " + studyConfiguration.getFileIds().get(entry.getKey()));
//                    }
//                }
//                for (Map.Entry<String, Integer> entry : studyConfigurationFromMongo.getCohortIds().entrySet()) {
//                    if (!studyConfiguration.getCohortIds().containsKey(entry.getKey())) {
//                        throw new StorageEngineException("StudyConfiguration do not have the cohort " + entry.getKey());
//                    }
//                    if (!studyConfiguration.getCohortIds().get(entry.getKey()).equals(entry.getValue())) {
//                        throw new StorageEngineException("StudyConfiguration changes the cohortId of '" + entry.getKey() + "' from " +
//                                entry.getValue() + " to " + studyConfiguration.getCohortIds().get(entry.getKey()));
//                    }
//                }
//                for (Map.Entry<String, Integer> entry : studyConfigurationFromMongo.getSampleIds().entrySet()) {
//                    if (!studyConfiguration.getSampleIds().containsKey(entry.getKey())) {
//                        throw new StorageEngineException("StudyConfiguration do not have the sample " + entry.getKey());
//                    }
//                    if (!studyConfiguration.getSampleIds().get(entry.getKey()).equals(entry.getValue())) {
//                        throw new StorageEngineException("StudyConfiguration changes the sampleId of '" + entry.getKey() + "' from " +
//                                entry.getValue() + " to " + studyConfiguration.getSampleIds().get(entry.getKey()));
//                    }
//                }
//                studyConfigurationDBAdaptor.updateStudyConfiguration(studyConfiguration, null);
//            } else {
//                throw new StorageEngineException("Unknown VariantDBAdaptor '" + dbAdaptor.getClass().toString() + "'. Expected '" +
//                        VariantMongoDBAdaptor.class + "'");
//            }
//        }
//    }
}