Java tutorial
/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.hadoop.variant; import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.variant.VariantSource; import org.opencb.biodata.models.variant.protobuf.VcfMeta; import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos.VcfSlice; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.core.common.ProgressLogger; import org.opencb.opencga.storage.core.config.StorageConfiguration; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.StudyConfiguration; import org.opencb.opencga.storage.core.variant.VariantStorageEngine.Options; import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils; import org.opencb.opencga.storage.hadoop.auth.HBaseCredentials; import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveHelper; import org.opencb.opencga.storage.hadoop.variant.archive.VariantHbasePutTask; import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor; import org.slf4j.LoggerFactory; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Collections; import java.util.zip.GZIPInputStream; /** * @author Matthias Haimel mh719+git@cam.ac.uk */ public class HadoopDirectVariantStoragePipeline extends AbstractHadoopVariantStoragePipeline { /** * @param configuration {@link StorageConfiguration} * @param storageEngineId Id * @param dbAdaptor {@link VariantHadoopDBAdaptor} * @param mrExecutor {@link MRExecutor} * @param conf {@link Configuration} * @param archiveCredentials {@link HBaseCredentials} * @param variantReaderUtils {@link VariantReaderUtils} * @param options {@link ObjectMap} */ public HadoopDirectVariantStoragePipeline(StorageConfiguration configuration, String storageEngineId, VariantHadoopDBAdaptor dbAdaptor, MRExecutor mrExecutor, Configuration conf, HBaseCredentials archiveCredentials, VariantReaderUtils variantReaderUtils, ObjectMap options) { super(configuration, storageEngineId, LoggerFactory.getLogger(HadoopDirectVariantStoragePipeline.class), dbAdaptor, variantReaderUtils, options, archiveCredentials, mrExecutor, conf); } @Override public URI preTransform(URI input) throws StorageEngineException, IOException, FileFormatException { if (StringUtils.isEmpty(options.getString(Options.TRANSFORM_FORMAT.key()))) { options.put(Options.TRANSFORM_FORMAT.key(), "proto"); } return super.preTransform(input); } /** * Read from VCF file, group by slice and insert into HBase table. * * @param inputUri {@link URI} * @throws StorageEngineException if the load fails */ protected void loadArch(URI inputUri) throws StorageEngineException { Path input = Paths.get(inputUri.getPath()); String table = archiveTableCredentials.getTable(); String fileName = input.getFileName().toString(); Path sourcePath = input.getParent().resolve(VariantReaderUtils.getMetaFromTransformedFile(fileName)); if (!VariantReaderUtils.isProto(fileName)) { throw new NotImplementedException("Direct loading only available for PROTO files."); } StudyConfiguration studyConfiguration = getStudyConfiguration(); Integer fileId; if (options.getBoolean(Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.key(), Options.ISOLATE_FILE_FROM_STUDY_CONFIGURATION.defaultValue())) { fileId = Options.FILE_ID.defaultValue(); } else { fileId = options.getInt(Options.FILE_ID.key()); } int studyId = getStudyId(); VariantSource source = VariantReaderUtils.readVariantSource(sourcePath, null); source.setFileId(fileId.toString()); source.setStudyId(Integer.toString(studyId)); VcfMeta meta = new VcfMeta(source); ArchiveHelper helper = new ArchiveHelper(dbAdaptor.getGenomeHelper(), meta); ProgressLogger progressLogger = new ProgressLogger("Loaded slices:", source.getStats() != null ? source.getStats().getNumRecords() : 0); VariantHbasePutTask hbaseWriter = new VariantHbasePutTask(helper, table); long counter = 0; long start = System.currentTimeMillis(); try (InputStream in = new BufferedInputStream(new GZIPInputStream(new FileInputStream(input.toFile())))) { hbaseWriter.open(); hbaseWriter.pre(); VcfSlice slice = VcfSlice.parseDelimitedFrom(in); while (null != slice) { ++counter; hbaseWriter.write(slice); progressLogger.increment(slice.getRecordsCount()); slice = VcfSlice.parseDelimitedFrom(in); } hbaseWriter.post(); } catch (IOException e) { throw new StorageEngineException("Problems reading " + input, e); } finally { hbaseWriter.close(); } long end = System.currentTimeMillis(); logger.info("Read {} slices", counter); logger.info("end - start = " + (end - start) / 1000.0 + "s"); HadoopVariantSourceDBAdaptor manager = dbAdaptor.getVariantSourceDBAdaptor(); try { manager.updateVariantSource(source); manager.updateLoadedFilesSummary(studyId, Collections.singletonList(fileId)); } catch (IOException e) { throw new StorageEngineException("Not able to store Variant Source for file!!!", e); } } @Override protected boolean needLoadFromHdfs() { return false; } }