org.opencb.opencga.storage.hadoop.variant.AbstractHadoopVariantStoragePipeline.java Source code

Introduction

Here is the source code for org.opencb.opencga.storage.hadoop.variant.AbstractHadoopVariantStoragePipeline.java

Source

/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.opencb.biodata.formats.io.FileFormatException;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantNormalizer;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.protobuf.VcfMeta;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos;
import org.opencb.biodata.tools.variant.VariantFileUtils;
import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader;
import org.opencb.biodata.tools.variant.converters.proto.VariantToVcfSliceConverter;
import org.opencb.biodata.tools.variant.stats.VariantGlobalStatsCalculator;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.io.DataWriter;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.commons.utils.FileUtils;
import org.opencb.hpg.bigdata.core.io.ProtoFileWriter;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.BatchFileOperation;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.io.plain.StringDataWriter;
import org.opencb.opencga.storage.core.variant.VariantStoragePipeline;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.core.variant.io.json.mixin.GenericRecordAvroJsonMixin;
import org.opencb.opencga.storage.core.variant.io.json.mixin.VariantSourceJsonMixin;
import org.opencb.opencga.storage.hadoop.auth.HBaseCredentials;
import org.opencb.opencga.storage.hadoop.exceptions.StorageHadoopException;
import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveDriver;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper;
import org.opencb.opencga.storage.hadoop.variant.archive.VariantHbaseTransformTask;
import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor;
import org.opencb.opencga.storage.hadoop.variant.index.AbstractVariantTableDriver;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableDriver;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.transform.VariantSliceReader;
import org.slf4j.Logger;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.function.BiConsumer;
import java.util.function.Supplier;

import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.*;

/**
 * Created by mh719 on 13/05/2016.
 */
public abstract class AbstractHadoopVariantStoragePipeline extends VariantStoragePipeline {
    protected final VariantHadoopDBAdaptor dbAdaptor;
    protected final Configuration conf;
    protected final HBaseCredentials archiveTableCredentials;
    protected final HBaseCredentials variantsTableCredentials;
    protected MRExecutor mrExecutor = null;

    // Do not create Phoenix indexes. For testing purposes only.
    public static final String SKIP_CREATE_PHOENIX_INDEXES = "skip.create.phoenix.indexes";
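    // Illustrative only (not part of the original source): tests would typically disable index
    // creation with something like options.put(SKIP_CREATE_PHOENIX_INDEXES, true).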

    public AbstractHadoopVariantStoragePipeline(StorageConfiguration configuration, String storageEngineId,
            Logger logger, VariantHadoopDBAdaptor dbAdaptor, VariantReaderUtils variantReaderUtils,
            ObjectMap options, HBaseCredentials archiveCredentials, MRExecutor mrExecutor, Configuration conf) {
        super(configuration, storageEngineId, logger, dbAdaptor, variantReaderUtils, options);
        this.archiveTableCredentials = archiveCredentials;
        this.mrExecutor = mrExecutor;
        this.dbAdaptor = dbAdaptor;
        this.variantsTableCredentials = dbAdaptor == null ? null : dbAdaptor.getCredentials();
        this.conf = new Configuration(conf);
    }

    @Override
    public URI preTransform(URI input) throws StorageEngineException, IOException, FileFormatException {
        logger.info("PreTransform: " + input);
        //        ObjectMap options = configuration.getStorageEngine(STORAGE_ENGINE_ID).getVariant().getOptions();
        if (!options.containsKey(VariantStorageEngine.Options.TRANSFORM_FORMAT.key())) {
            options.put(VariantStorageEngine.Options.TRANSFORM_FORMAT.key(),
                    VariantStorageEngine.Options.TRANSFORM_FORMAT.defaultValue());
        }
        String transVal = options.getString(VariantStorageEngine.Options.TRANSFORM_FORMAT.key());
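        // Only the "avro" and "proto" transformed formats are accepted by the Hadoop storage engine.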
        switch (transVal) {
        case "avro":
        case "proto":
            break;
        default:
            throw new NotImplementedException(
                    String.format("Output format %s not supported for Hadoop!", transVal));
        }
        if (!options.containsKey(VariantStorageEngine.Options.GVCF.key())) {
            options.put(VariantStorageEngine.Options.GVCF.key(), true);
        }
        boolean isGvcf = options.getBoolean(VariantStorageEngine.Options.GVCF.key());
        if (!isGvcf) {
            throw new NotImplementedException("Only GVCF format supported!!!");
        }
        return super.preTransform(input);
    }

    @Override
    protected Pair<Long, Long> processProto(Path input, String fileName, Path output, VariantSource source,
            Path outputVariantsFile, Path outputMetaFile, boolean includeSrc, String parser,
            boolean generateReferenceBlocks, int batchSize, String extension, String compression,
            BiConsumer<String, RuntimeException> malformatedHandler, boolean failOnError)
            throws StorageEngineException {

        //Writer
        DataWriter<VcfSliceProtos.VcfSlice> dataWriter = new ProtoFileWriter<>(outputVariantsFile, compression);

        // Normalizer
        VariantNormalizer normalizer = new VariantNormalizer();
        normalizer.setGenerateReferenceBlocks(generateReferenceBlocks);

        // Stats calculator
        VariantGlobalStatsCalculator statsCalculator = new VariantGlobalStatsCalculator(source);

        VariantReader dataReader = null;
        try {
            if (VariantReaderUtils.isVcf(input.toString())) {
                InputStream inputStream = FileUtils.newInputStream(input);

                VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(inputStream, source, normalizer);
                if (null != malformatedHandler) {
                    reader.registerMalformatedVcfHandler(malformatedHandler);
                    reader.setFailOnError(failOnError);
                }
                dataReader = reader;
            } else {
                dataReader = VariantReaderUtils.getVariantReader(input, source);
            }
        } catch (IOException e) {
            throw new StorageEngineException("Unable to read from " + input, e);
        }

        // Transformer
        VcfMeta meta = new VcfMeta(source);
        ArchiveHelper helper = new ArchiveHelper(conf, meta);
        ProgressLogger progressLogger = new ProgressLogger("Transform proto:").setBatchSize(100000);

        logger.info("Generating output file {}", outputVariantsFile);

        long start = System.currentTimeMillis();
        long end;
        // FIXME
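        // Two transform modes: a ParallelTaskRunner pipeline (enabled with the "transform.proto.parallel"
        // option) that converts variant slices into VcfSlice protobuf records using TRANSFORM_THREADS worker
        // tasks, and a single-threaded read/transform/write loop that also gathers per-stage timings.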
        if (options.getBoolean("transform.proto.parallel")) {
            VariantSliceReader sliceReader = new VariantSliceReader(helper.getChunkSize(), dataReader);

            // Use a supplier to avoid concurrent modification of non-thread-safe objects.
            Supplier<ParallelTaskRunner.TaskWithException<ImmutablePair<Long, List<Variant>>, VcfSliceProtos.VcfSlice, ?>> supplier = () -> {
                VariantToVcfSliceConverter converter = new VariantToVcfSliceConverter();
                return batch -> {
                    List<VcfSliceProtos.VcfSlice> slices = new ArrayList<>(batch.size());
                    for (ImmutablePair<Long, List<Variant>> pair : batch) {
                        slices.add(converter.convert(pair.getRight(), pair.getLeft().intValue()));
                        progressLogger.increment(pair.getRight().size());
                    }
                    return slices;
                };
            };

            ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
                    .setNumTasks(options.getInt(Options.TRANSFORM_THREADS.key(), 1)).setBatchSize(1)
                    .setAbortOnFail(true).setSorted(false).setCapacity(1).build();

            ParallelTaskRunner<ImmutablePair<Long, List<Variant>>, VcfSliceProtos.VcfSlice> ptr;
            ptr = new ParallelTaskRunner<>(sliceReader, supplier, dataWriter, config);

            try {
                ptr.run();
            } catch (ExecutionException e) {
                throw new StorageEngineException(
                        String.format("Error while Transforming file %s into %s", input, outputVariantsFile), e);
            }
            end = System.currentTimeMillis();
        } else {
            VariantHbaseTransformTask transformTask = new VariantHbaseTransformTask(helper, null);
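            // Accumulated time in nanoseconds: t[0] = read, t[1] = transform, t[2] = write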
            long[] t = new long[] { 0, 0, 0 };
            long last = System.nanoTime();

            try {
                dataReader.open();
                dataReader.pre();
                dataWriter.open();
                dataWriter.pre();
                transformTask.pre();
                statsCalculator.pre();

                start = System.currentTimeMillis();
                last = System.nanoTime();
                // Process data
                List<Variant> read = dataReader.read(batchSize);
                t[0] += System.nanoTime() - last;
                last = System.nanoTime();
                while (!read.isEmpty()) {
                    progressLogger.increment(read.size());
                    statsCalculator.apply(read);
                    List<VcfSliceProtos.VcfSlice> slices = transformTask.apply(read);
                    t[1] += System.nanoTime() - last;
                    last = System.nanoTime();
                    dataWriter.write(slices);
                    t[2] += System.nanoTime() - last;
                    last = System.nanoTime();
                    read = dataReader.read(batchSize);
                    t[0] += System.nanoTime() - last;
                    last = System.nanoTime();
                }
                List<VcfSliceProtos.VcfSlice> drain = transformTask.drain();
                t[1] += System.nanoTime() - last;
                last = System.nanoTime();
                dataWriter.write(drain);
                t[2] += System.nanoTime() - last;

                source.getMetadata().put(VariantFileUtils.VARIANT_FILE_HEADER, dataReader.getHeader());
                statsCalculator.post();
                transformTask.post();
                dataReader.post();
                dataWriter.post();

                end = System.currentTimeMillis();
                logger.info("Times for reading: {}, transforming {}, writing {}",
                        TimeUnit.NANOSECONDS.toSeconds(t[0]), TimeUnit.NANOSECONDS.toSeconds(t[1]),
                        TimeUnit.NANOSECONDS.toSeconds(t[2]));
            } catch (Exception e) {
                throw new StorageEngineException(
                        String.format("Error while Transforming file %s into %s", input, outputVariantsFile), e);
            } finally {
                dataWriter.close();
                dataReader.close();
            }
        }

        ObjectMapper jsonObjectMapper = new ObjectMapper();
        jsonObjectMapper.addMixIn(VariantSource.class, VariantSourceJsonMixin.class);
        jsonObjectMapper.addMixIn(GenericRecord.class, GenericRecordAvroJsonMixin.class);

        ObjectWriter variantSourceObjectWriter = jsonObjectMapper.writerFor(VariantSource.class);
        try {
            String sourceJsonString = variantSourceObjectWriter.writeValueAsString(source);
            StringDataWriter.write(outputMetaFile, Collections.singletonList(sourceJsonString));
        } catch (IOException e) {
            throw new StorageEngineException("Error writing meta file", e);
        }
        return new ImmutablePair<>(start, end);
    }

    @Override
    public URI preLoad(URI input, URI output) throws StorageEngineException {
        boolean loadArch = options.getBoolean(HADOOP_LOAD_ARCHIVE);
        boolean loadVar = options.getBoolean(HADOOP_LOAD_VARIANT);

        if (!loadArch && !loadVar) {
            loadArch = true;
            loadVar = true;
            options.put(HADOOP_LOAD_ARCHIVE, loadArch);
            options.put(HADOOP_LOAD_VARIANT, loadVar);
        }

        if (loadArch) {
            super.preLoad(input, output);

            if (needLoadFromHdfs() && !input.getScheme().equals("hdfs")) {
                if (!StringUtils.isEmpty(options.getString(OPENCGA_STORAGE_HADOOP_INTERMEDIATE_HDFS_DIRECTORY))) {
                    output = URI.create(options.getString(OPENCGA_STORAGE_HADOOP_INTERMEDIATE_HDFS_DIRECTORY));
                }
                if (output.getScheme() != null && !output.getScheme().equals("hdfs")) {
                    throw new StorageEngineException("Output must be in HDFS");
                }

                try {
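                    // Copy the transformed variants file and its companion metadata file from the
                    // local filesystem into HDFS so that the load step can read them there.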
                    long startTime = System.currentTimeMillis();
                    //                    Configuration conf = getHadoopConfiguration(options);
                    FileSystem fs = FileSystem.get(conf);
                    org.apache.hadoop.fs.Path variantsOutputPath = new org.apache.hadoop.fs.Path(
                            output.resolve(Paths.get(input.getPath()).getFileName().toString()));
                    logger.info("Copy from {} to {}", new org.apache.hadoop.fs.Path(input).toUri(),
                            variantsOutputPath.toUri());
                    fs.copyFromLocalFile(false, new org.apache.hadoop.fs.Path(input), variantsOutputPath);
                    logger.info("Copied to hdfs in {}s", (System.currentTimeMillis() - startTime) / 1000.0);

                    startTime = System.currentTimeMillis();
                    URI fileInput = URI.create(VariantReaderUtils.getMetaFromTransformedFile(input.toString()));
                    org.apache.hadoop.fs.Path fileOutputPath = new org.apache.hadoop.fs.Path(
                            output.resolve(Paths.get(fileInput.getPath()).getFileName().toString()));
                    logger.info("Copy from {} to {}", new org.apache.hadoop.fs.Path(fileInput).toUri(),
                            fileOutputPath.toUri());
                    fs.copyFromLocalFile(false, new org.apache.hadoop.fs.Path(fileInput), fileOutputPath);
                    logger.info("Copied to hdfs in {}s", (System.currentTimeMillis() - startTime) / 1000.0);

                    input = variantsOutputPath.toUri();
                } catch (IOException e) {
                    throw new StorageEngineException("Error copying transformed files to HDFS", e);
                }
            }
        }

        try {
            ArchiveDriver.createArchiveTableIfNeeded(dbAdaptor.getGenomeHelper(),
                    archiveTableCredentials.getTable(), dbAdaptor.getConnection());
        } catch (IOException e) {
            throw new StorageHadoopException("Issue creating table " + archiveTableCredentials.getTable(), e);
        }
        try {
            VariantTableDriver.createVariantTableIfNeeded(dbAdaptor.getGenomeHelper(),
                    variantsTableCredentials.getTable(), dbAdaptor.getConnection());
        } catch (IOException e) {
            throw new StorageHadoopException("Issue creating table " + variantsTableCredentials.getTable(), e);
        }

        if (loadVar) {
            preMerge(input);
        }

        return input;
    }

    protected void preMerge(URI input) throws StorageEngineException {
        int studyId = getStudyId();

        VariantPhoenixHelper phoenixHelper = new VariantPhoenixHelper(dbAdaptor.getGenomeHelper());
        try {
            Connection jdbcConnection = dbAdaptor.getJdbcConnection();
            String tableName = variantsTableCredentials.getTable();
            phoenixHelper.registerNewStudy(jdbcConnection, tableName, studyId);
            if (!options.getBoolean(SKIP_CREATE_PHOENIX_INDEXES, false)) {
                if (options.getString(VariantAnnotationManager.SPECIES, "hsapiens").equalsIgnoreCase("hsapiens")) {
                    List<PhoenixHelper.Column> columns = VariantPhoenixHelper
                            .getHumanPopulationFrequenciesColumns();
                    phoenixHelper.getPhoenixHelper().addMissingColumns(jdbcConnection, tableName, columns, true);
                    List<PhoenixHelper.Index> popFreqIndices = VariantPhoenixHelper.getPopFreqIndices(tableName);
                    phoenixHelper.getPhoenixHelper().createIndexes(jdbcConnection, tableName, popFreqIndices,
                            false);
                }
                phoenixHelper.createVariantIndexes(jdbcConnection, tableName);
            } else {
                logger.info("Skip create indexes!!");
            }
        } catch (SQLException e) {
            throw new StorageEngineException("Unable to register study in Phoenix", e);
        }

        long lock = dbAdaptor.getStudyConfigurationManager().lockStudy(studyId);

        // Get the studyConfiguration. If there is no StudyConfiguration, create an empty one.
        try {
            StudyConfiguration studyConfiguration = checkOrCreateStudyConfiguration(true);
            VariantSource source = readVariantSource(input, options);
            securePreMerge(studyConfiguration, source);
            dbAdaptor.getStudyConfigurationManager().updateStudyConfiguration(studyConfiguration, null);
        } finally {
            dbAdaptor.getStudyConfigurationManager().unLockStudy(studyId, lock);
        }

    }

    protected void securePreMerge(StudyConfiguration studyConfiguration, VariantSource source)
            throws StorageEngineException {

        boolean loadArch = options.getBoolean(HADOOP_LOAD_ARCHIVE);
        boolean loadVar = options.getBoolean(HADOOP_LOAD_VARIANT);

        if (loadVar) {
            // Load into variant table
            // Update the studyConfiguration with data from the Archive Table.
            // Reads the VcfMeta documents, and populates the StudyConfiguration if needed.
            // Obtain the list of pending files.

            int studyId = options.getInt(VariantStorageEngine.Options.STUDY_ID.key(), -1);
            int fileId = options.getInt(VariantStorageEngine.Options.FILE_ID.key(), -1);
            boolean missingFilesDetected = false;

            HadoopVariantSourceDBAdaptor fileMetadataManager = dbAdaptor.getVariantSourceDBAdaptor();
            Set<Integer> files = null;
            try {
                files = fileMetadataManager.getLoadedFiles(studyId);
            } catch (IOException e) {
                throw new StorageHadoopException("Unable to read loaded files", e);
            }

            logger.info("Found files in Archive DB: " + files);

            // Pending files, not in analysis but in archive.
            List<Integer> pendingFiles = new LinkedList<>();
            logger.info("Found registered indexed files: {}", studyConfiguration.getIndexedFiles());
            for (Integer loadedFileId : files) {
                VariantSource readSource;
                try {
                    readSource = fileMetadataManager.getVariantSource(studyId, loadedFileId, null);
                } catch (IOException e) {
                    throw new StorageHadoopException("Unable to read file VcfMeta for file : " + loadedFileId, e);
                }

                Integer readFileId = Integer.parseInt(readSource.getFileId());
                logger.debug("Found source for file id {} with registered id {} ", loadedFileId, readFileId);
                if (!studyConfiguration.getFileIds().inverse().containsKey(readFileId)) {
                    checkNewFile(studyConfiguration, readFileId, readSource.getFileName());
                    studyConfiguration.getFileIds().put(readSource.getFileName(), readFileId);
                    //                    studyConfiguration.getHeaders().put(readFileId, readSource.getMetadata()
                    //                            .get(VariantFileUtils.VARIANT_FILE_HEADER).toString());
                    checkAndUpdateStudyConfiguration(studyConfiguration, readFileId, readSource, options);
                    missingFilesDetected = true;
                }
                if (!studyConfiguration.getIndexedFiles().contains(readFileId)) {
                    pendingFiles.add(readFileId);
                }
            }
            logger.info("Found pending in DB: " + pendingFiles);

            fileId = checkNewFile(studyConfiguration, fileId, source.getFileName());

            if (!loadArch) {
                //If archive loading is skipped, the input fileId must already be in the archive table, i.e. "pending to be loaded"
                if (!pendingFiles.contains(fileId)) {
                    throw new StorageEngineException("File " + fileId + " is not loaded in archive table "
                            + getArchiveTableName(studyId, options));
                }
            } else {
                //If archive loading is not skipped, the input fileId must not be pending, because it must not already be in the archive table.
                if (pendingFiles.contains(fileId)) {
                    // set loadArch to false?
                    throw new StorageEngineException("File " + fileId + " is already loaded in archive table");
                } else {
                    pendingFiles.add(fileId);
                }
            }

            //If there are some given pending files, load only those files, not all pending files
            List<Integer> givenPendingFiles = options.getAsIntegerList(HADOOP_LOAD_VARIANT_PENDING_FILES);
            if (!givenPendingFiles.isEmpty()) {
                logger.info("Given Pending file list: " + givenPendingFiles);
                for (Integer pendingFile : givenPendingFiles) {
                    if (!pendingFiles.contains(pendingFile)) {
                        throw new StorageEngineException(
                                "File " + pendingFile + " is not pending to be loaded in variant table");
                    }
                }
                pendingFiles = givenPendingFiles;
            } else {
                options.put(HADOOP_LOAD_VARIANT_PENDING_FILES, pendingFiles);
            }

            boolean resume = options.getBoolean(Options.RESUME.key(), Options.RESUME.defaultValue())
                    || options.getBoolean(HadoopVariantStorageEngine.HADOOP_LOAD_VARIANT_RESUME, false);
            BatchFileOperation op = addBatchOperation(studyConfiguration, VariantTableDriver.JOB_OPERATION_NAME,
                    pendingFiles, resume, BatchFileOperation.Type.LOAD);
            options.put(HADOOP_LOAD_VARIANT_STATUS, op.currentStatus());
            options.put(AbstractVariantTableDriver.TIMESTAMP, op.getTimestamp());

        }
    }

    /**
     * Adds a new {@link BatchFileOperation} to the StudyConfiguration.
     *
     * Only one running operation is allowed at a time.
     * If the last operation is READY, a new operation is created.
     * If the last operation is in ERROR, continue only if it is the same operation over the same files.
     * If the last operation is DONE or RUNNING, continue only if resume=true.
     *
     * If it is a new operation, the timestamp is incremented.
     * (An illustrative walk-through of these rules follows the method below.)
     *
     * @param studyConfiguration StudyConfiguration
     * @param jobOperationName   Job operation name used to create the jobName and as {@link BatchFileOperation#operationName}
     * @param fileIds            Files to be processed in this batch.
     * @param resume             Resume operation. Assume that previous operation went wrong.
     * @param type               Operation type as {@link BatchFileOperation#type}
     * @return                   The current batchOperation
     * @throws StorageEngineException if the operation can't be executed
     */
    protected BatchFileOperation addBatchOperation(StudyConfiguration studyConfiguration, String jobOperationName,
            List<Integer> fileIds, boolean resume, BatchFileOperation.Type type) throws StorageEngineException {

        List<BatchFileOperation> batches = studyConfiguration.getBatches();
        BatchFileOperation batchFileOperation;
        boolean newOperation = false;
        if (!batches.isEmpty()) {
            batchFileOperation = batches.get(batches.size() - 1);
            BatchFileOperation.Status currentStatus = batchFileOperation.currentStatus();

            switch (currentStatus) {
            case READY:
                batchFileOperation = new BatchFileOperation(jobOperationName, fileIds,
                        batchFileOperation.getTimestamp() + 1, type);
                newOperation = true;
                break;
            case DONE:
            case RUNNING:
                if (!resume) {
                    throw new StorageHadoopException(
                            "Unable to process a new batch. Ongoing batch operation: " + batchFileOperation);
                }
                // DO NOT BREAK! Fall through to the ERROR case to resume the last loading.
            case ERROR:
                Collections.sort(fileIds);
                Collections.sort(batchFileOperation.getFileIds());
                if (batchFileOperation.getFileIds().equals(fileIds)) {
                    logger.info("Resuming Last batch loading due to error.");
                } else {
                    throw new StorageHadoopException("Unable to resume last batch operation. "
                            + "Must have the same files from the previous batch: " + batchFileOperation);
                }
                break;
            default:
                throw new IllegalArgumentException("Unknown Status " + currentStatus);
            }
        } else {
            batchFileOperation = new BatchFileOperation(jobOperationName, fileIds, 1, type);
            newOperation = true;
        }
        if (!Objects.equals(batchFileOperation.currentStatus(), BatchFileOperation.Status.DONE)) {
            batchFileOperation.addStatus(Calendar.getInstance().getTime(), BatchFileOperation.Status.RUNNING);
        }
        if (newOperation) {
            batches.add(batchFileOperation);
        }
        return batchFileOperation;
    }
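    // Illustrative walk-through (not part of the original source): if the last batch ended in ERROR
    // with fileIds [1, 2], calling addBatchOperation with [1, 2] reuses that BatchFileOperation and
    // simply marks it RUNNING again; calling it with [3] throws a StorageHadoopException, while a
    // last batch in READY status always produces a new operation with an incremented timestamp.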

    /**
     * Specifies whether the current class needs to copy the file to load into HDFS.
     *
     * If true, the transformed file will be copied to HDFS during {@link #preLoad}.
     *
     * @return true if the transformed file must be copied to HDFS before loading
     */
    protected abstract boolean needLoadFromHdfs();
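    // Illustrative only (not part of the original source): a concrete subclass that loads the archive
    // table through a MapReduce job would typically return true here, e.g.
    //     @Override protected boolean needLoadFromHdfs() { return true; }
    // while a subclass that loads directly from the local filesystem would return false.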

    @Override
    public URI load(URI input) throws IOException, StorageEngineException {
        int studyId = getStudyId();
        int fileId = options.getInt(Options.FILE_ID.key());

        boolean loadArch = options.getBoolean(HADOOP_LOAD_ARCHIVE);
        boolean loadVar = options.getBoolean(HADOOP_LOAD_VARIANT);

        ArchiveHelper.setChunkSize(conf,
                conf.getInt(ArchiveDriver.CONFIG_ARCHIVE_CHUNK_SIZE, ArchiveDriver.DEFAULT_CHUNK_SIZE));
        ArchiveHelper.setStudyId(conf, studyId);

        if (loadArch) {
            Set<Integer> loadedFiles = dbAdaptor.getVariantSourceDBAdaptor().getLoadedFiles(studyId);
            if (!loadedFiles.contains(fileId)) {
                loadArch(input);
            } else {
                logger.info("File {} already loaded in archive table. Skip this step!",
                        Paths.get(input.getPath()).getFileName().toString());
            }
        }

        if (loadVar) {
            List<Integer> pendingFiles = options.getAsIntegerList(HADOOP_LOAD_VARIANT_PENDING_FILES);
            merge(studyId, pendingFiles);
        }

        return input; // TODO  change return value?
    }

    protected abstract void loadArch(URI input) throws StorageEngineException;

    public void merge(int studyId, List<Integer> pendingFiles) throws StorageEngineException {
        // Check if status is "DONE"
        if (options.get(HADOOP_LOAD_VARIANT_STATUS, BatchFileOperation.Status.class)
                .equals(BatchFileOperation.Status.DONE)) {
            // Merge operation status : DONE, not READY or RUNNING
            // Don't need to merge again. Skip merge and run post-load/post-merge step
            logger.info("Files {} already merged!", pendingFiles);
            return;
        }
        String hadoopRoute = options.getString(HADOOP_BIN, "hadoop");
        String jar = getJarWithDependencies();
        options.put(HADOOP_LOAD_VARIANT_PENDING_FILES, pendingFiles);

        Class execClass = VariantTableDriver.class;
        String args = VariantTableDriver.buildCommandLineArgs(variantsTableCredentials.toString(),
                archiveTableCredentials.getTable(), variantsTableCredentials.getTable(), studyId, pendingFiles,
                options);
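        // Resulting command shape:
        //   <hadoop> jar <jar-with-dependencies> org.opencb.opencga.storage.hadoop.variant.index.VariantTableDriver <args>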
        String executable = hadoopRoute + " jar " + jar + ' ' + execClass.getName();

        long startTime = System.currentTimeMillis();
        Thread hook = newShutdownHook(VariantTableDriver.JOB_OPERATION_NAME, pendingFiles);
        Runtime.getRuntime().addShutdownHook(hook);
        try {
            logger.info("------------------------------------------------------");
            logger.info("Loading files {} into analysis table '{}'", pendingFiles,
                    variantsTableCredentials.getTable());
            logger.info(executable + " " + args);
            logger.info("------------------------------------------------------");
            int exitValue = mrExecutor.run(executable, args);
            logger.info("------------------------------------------------------");
            logger.info("Exit value: {}", exitValue);
            logger.info("Total time: {}s", (System.currentTimeMillis() - startTime) / 1000.0);
            if (exitValue != 0) {
                throw new StorageEngineException("Error loading files " + pendingFiles + " into variant table \""
                        + variantsTableCredentials.getTable() + "\"");
            }
            setStatus(BatchFileOperation.Status.DONE, VariantTableDriver.JOB_OPERATION_NAME, pendingFiles);
        } catch (Exception e) {
            setStatus(BatchFileOperation.Status.ERROR, VariantTableDriver.JOB_OPERATION_NAME, pendingFiles);
            throw e;
        } finally {
            Runtime.getRuntime().removeShutdownHook(hook);
        }
    }

    public String getJarWithDependencies() throws StorageEngineException {
        return getJarWithDependencies(options);
    }

    public static String getJarWithDependencies(ObjectMap options) throws StorageEngineException {
        String jar = options.getString(OPENCGA_STORAGE_HADOOP_JAR_WITH_DEPENDENCIES, null);
        if (jar == null) {
            throw new StorageEngineException("Missing option " + OPENCGA_STORAGE_HADOOP_JAR_WITH_DEPENDENCIES);
        }
        if (!Paths.get(jar).isAbsolute()) {
            jar = System.getProperty("app.home", "") + "/" + jar;
        }
        return jar;
    }

    @Override
    protected void checkLoadedVariants(URI input, int fileId, StudyConfiguration studyConfiguration)
            throws StorageEngineException {
        logger.warn("Skip check loaded variants");
    }

    @Override
    public URI postLoad(URI input, URI output) throws StorageEngineException {
        if (options.getBoolean(HADOOP_LOAD_VARIANT)) {
            // Current StudyConfiguration may be outdated. Remove it.
            options.remove(VariantStorageEngine.Options.STUDY_CONFIGURATION.key());

            //            HadoopCredentials dbCredentials = getDbCredentials();
            //            VariantHadoopDBAdaptor dbAdaptor = getDBAdaptor(dbCredentials);

            options.put(VariantStorageEngine.Options.FILE_ID.key(),
                    options.getAsIntegerList(HADOOP_LOAD_VARIANT_PENDING_FILES));

            return super.postLoad(input, output);
        } else {
            logger.debug("Skip post load");
            return input;
        }
    }

    @Override
    public void securePostLoad(List<Integer> fileIds, StudyConfiguration studyConfiguration)
            throws StorageEngineException {
        super.securePostLoad(fileIds, studyConfiguration);
        BatchFileOperation.Status status = secureSetStatus(studyConfiguration, BatchFileOperation.Status.READY,
                VariantTableDriver.JOB_OPERATION_NAME, fileIds);
        if (status != BatchFileOperation.Status.DONE) {
            logger.warn("Unexpected status " + status);
        }
    }

}