org.opencb.opencga.storage.hadoop.variant.HadoopDirectVariantStoragePipeline.java Source code

Introduction

Here is the source code for org.opencb.opencga.storage.hadoop.variant.HadoopDirectVariantStoragePipeline.java. This pipeline loads transformed PROTO variant files (gzipped streams of delimited VcfSlice protobuf records) directly into an HBase archive table, without first copying the input to HDFS.

Source

/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant;

import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.opencb.biodata.formats.io.FileFormatException;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.protobuf.VcfMeta;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos.VcfSlice;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.opencga.core.common.ProgressLogger;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.opencb.opencga.storage.hadoop.auth.HBaseCredentials;
import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper;
import org.opencb.opencga.storage.hadoop.variant.archive.VariantHbasePutTask;
import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.zip.GZIPInputStream;

/**
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public class HadoopDirectVariantStoragePipeline extends AbstractHadoopVariantStoragePipeline {

    /**
     * @param configuration      {@link StorageConfiguration}
     * @param storageEngineId    Id
     * @param dbAdaptor          {@link VariantHadoopDBAdaptor}
     * @param mrExecutor         {@link MRExecutor}
     * @param conf               {@link Configuration}
     * @param archiveCredentials {@link HBaseCredentials}
     * @param variantReaderUtils {@link VariantReaderUtils}
     * @param options            {@link ObjectMap}
     */
    public HadoopDirectVariantStoragePipeline(StorageConfiguration configuration, String storageEngineId,
            VariantHadoopDBAdaptor dbAdaptor, MRExecutor mrExecutor, Configuration conf,
            HBaseCredentials archiveCredentials, VariantReaderUtils variantReaderUtils, ObjectMap options) {
        super(configuration, storageEngineId, LoggerFactory.getLogger(HadoopDirectVariantStoragePipeline.class),
                dbAdaptor, variantReaderUtils, options, archiveCredentials, mrExecutor, conf);
    }

    @Override
    public URI preTransform(URI input) throws StorageEngineException, IOException, FileFormatException {
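        // Default the transform output format to PROTO, the only format the direct loader accepts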
        if (StringUtils.isEmpty(options.getString(Options.TRANSFORM_FORMAT.key()))) {
            options.put(Options.TRANSFORM_FORMAT.key(), "proto");
        }
        return super.preTransform(input);
    }

    /**
     * Read from VCF file, group by slice and insert into HBase table.
     *
     * @param inputUri {@link URI}
     * @throws StorageEngineException if the load fails
     */
    protected void loadArch(URI inputUri) throws StorageEngineException {
        Path input = Paths.get(inputUri.getPath());
        String table = archiveTableCredentials.getTable();
        String fileName = input.getFileName().toString();
        Path sourcePath = input.getParent().resolve(VariantReaderUtils.getMetaFromTransformedFile(fileName));

        if (!VariantReaderUtils.isProto(fileName)) {
            throw new NotImplementedException("Direct loading only available for PROTO files.");
        }

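        // Resolve the file and study identifiers for this load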
        StudyConfiguration studyConfiguration = getStudyConfiguration();
        Integer fileId;
        if (options.getBoolean(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key(),
                Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.defaultValue())) {
            fileId = Options.FILE_ID.defaultValue();
        } else {
            fileId = options.getInt(Options.FILE_ID.key());
        }
        int studyId = getStudyId();

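        // Read the metadata (VariantSource) produced by the transform step and wrap it for the archive helper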
        VariantSource source = VariantReaderUtils.readVariantSource(sourcePath, null);
        source.setFileId(fileId.toString());
        source.setStudyId(Integer.toString(studyId));
        VcfMeta meta = new VcfMeta(source);
        ArchiveHelper helper = new ArchiveHelper(dbAdaptor.getGenomeHelper(), meta);

        ProgressLogger progressLogger = new ProgressLogger("Loaded slices:",
                source.getStats() != null ? source.getStats().getNumRecords() : 0);
        VariantHbasePutTask hbaseWriter = new VariantHbasePutTask(helper, table);
        long counter = 0;
        long start = System.currentTimeMillis();
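        // Stream the gzipped input and write each delimited VcfSlice to the HBase archive table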
        try (InputStream in = new BufferedInputStream(new GZIPInputStream(new FileInputStream(input.toFile())))) {
            hbaseWriter.open();
            hbaseWriter.pre();
            VcfSlice slice = VcfSlice.parseDelimitedFrom(in);
            while (null != slice) {
                ++counter;
                hbaseWriter.write(slice);
                progressLogger.increment(slice.getRecordsCount());
                slice = VcfSlice.parseDelimitedFrom(in);
            }
            hbaseWriter.post();
        } catch (IOException e) {
            throw new StorageEngineException("Problems reading " + input, e);
        } finally {
            hbaseWriter.close();
        }
        long end = System.currentTimeMillis();
        logger.info("Read {} slices", counter);
        logger.info("end - start = " + (end - start) / 1000.0 + "s");

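        // Register the file metadata and mark the file as loaded in the study summary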
        HadoopVariantSourceDBAdaptor manager = dbAdaptor.getVariantSourceDBAdaptor();
        try {
            manager.updateVariantSource(source);
            manager.updateLoadedFilesSummary(studyId, Collections.singletonList(fileId));
        } catch (IOException e) {
            throw new StorageEngineException("Not able to store Variant Source for file!!!", e);
        }
    }

    @Override
    protected boolean needLoadFromHdfs() {
        return false;
    }

}
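
Example

The core of loadArch is the delimited-protobuf read loop: the transformed input is a gzipped stream of length-prefixed VcfSlice messages, read one at a time until parseDelimitedFrom returns null. The following is a minimal, self-contained sketch of that loop with the HBase writer replaced by simple counters; the class name and the command-line argument are illustrative and not part of OpenCGA.

import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos.VcfSlice;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

public class VcfSliceReadExample {
    public static void main(String[] args) throws IOException {
        // args[0]: path to a transformed PROTO file (gzipped, delimited VcfSlice records)
        String path = args[0];
        long slices = 0;
        long records = 0;
        try (InputStream in = new BufferedInputStream(new GZIPInputStream(new FileInputStream(path)))) {
            // parseDelimitedFrom returns null once the stream is exhausted
            VcfSlice slice = VcfSlice.parseDelimitedFrom(in);
            while (slice != null) {
                slices++;
                records += slice.getRecordsCount();
                slice = VcfSlice.parseDelimitedFrom(in);
            }
        }
        System.out.println("Read " + slices + " slices containing " + records + " records");
    }
}

In the pipeline itself each slice is handed to a VariantHbasePutTask instead of being counted, and the VariantSource metadata and loaded-files summary are updated once the stream has been fully consumed.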