// OpenCGA storage-hadoop: HBase-backed variant writer
/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant;

import java.io.IOException;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;
import org.opencb.biodata.formats.variant.vcf4.VcfUtils;
import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.variant.VariantSourceEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.effect.VariantEffect;
import org.opencb.biodata.models.variant.protobuf.VariantProtos;
import org.opencb.biodata.models.variant.protobuf.VariantStatsProtos;
import org.opencb.biodata.models.variant.stats.VariantStats;
import org.opencb.commons.utils.CryptoUtils;
import org.opencb.opencga.core.auth.MonbaseCredentials;
import org.opencb.opencga.storage.core.variant.io.VariantDBWriter;
import org.opencb.opencga.storage.core.variant.io.VariantEffectProtos;

/**
 * Writes batches of {@link Variant} objects to a pair of HBase tables:
 * one per-species table for variant data ("d" family for sample/attribute
 * data, "i" family for raw/statistics columns) and one "&lt;species&gt;effect"
 * table for variant-effect data.
 *
 * <p>Variants are accumulated into {@link Put} maps keyed by rowkey
 * (chromosome_start_ref_alt, with SHA-1 digests substituted for
 * structural-variant-sized alleles) and flushed in {@link #writeBatch(List)}.
 * Not thread-safe: the put maps are unsynchronized mutable state.
 *
 * @author Cristina Yenyxe Gonzalez Garcia &lt;cyenyxe@ebi.ac.uk&gt;
 * @author Jesus Rodriguez &lt;jesusrodrc@gmail.com&gt;
 */
public class VariantHbaseWriter extends VariantDBWriter {

    private static final Logger LOGGER = Logger.getLogger(VariantHbaseWriter.class.getName());

    /** Column family for raw main columns and statistics. */
    private final byte[] infoColumnFamily = "i".getBytes();
    /** Column family for per-sample and per-file data. */
    private final byte[] dataColumnFamily = "d".getBytes();

    private String tableName;
    private VariantSource source;

    private HBaseAdmin admin;
    private HTable variantTable;
    private HTable effectTable;
    /** Pending variant puts, keyed by rowkey so one Put aggregates all files. */
    private Map<String, Put> putMap;
    /** Pending effect puts, keyed by rowkey. */
    private Map<String, Put> effectPutMap;

    private MonbaseCredentials credentials;

    private boolean includeStats;
    private boolean includeEffect;
    private boolean includeSamples;

    private VariantStatsToHbaseConverter statsConverter;

    /**
     * Creates a writer that stores neither samples, stats nor effects.
     *
     * @param source      description of the file/study being loaded
     * @param species     species name, used as the HBase table name
     * @param credentials connection details for the HBase/Zookeeper cluster
     */
    public VariantHbaseWriter(VariantSource source, String species, MonbaseCredentials credentials) {
        this(source, species, credentials, false, false, false);
    }

    /**
     * Creates a writer with explicit control over which optional pieces of
     * information are persisted.
     *
     * @param source         description of the file/study being loaded
     * @param species        species name, used as the HBase table name
     * @param credentials    connection details for the HBase/Zookeeper cluster
     * @param includeSamples whether per-sample genotype columns are written
     * @param includeStats   whether per-file statistics columns are written
     * @param includeEffect  whether effect rows are written to the effect table
     * @throws IllegalArgumentException if credentials is null
     */
    public VariantHbaseWriter(VariantSource source, String species, MonbaseCredentials credentials,
                              boolean includeSamples, boolean includeStats, boolean includeEffect) {
        if (credentials == null) {
            throw new IllegalArgumentException("Credentials for accessing the database must be specified");
        }
        this.source = source;
        this.tableName = species;
        this.putMap = new HashMap<>();
        this.effectPutMap = new HashMap<>();
        this.credentials = credentials;

        this.includeSamples = includeSamples;
        this.includeStats = includeStats;
        this.includeEffect = includeEffect;
        if (this.includeStats) {
            statsConverter = new VariantStatsToHbaseConverter();
        }
    }

    /**
     * Opens the connection to the HBase cluster.
     *
     * @return true if the admin client could be created, false on any error
     */
    @Override
    public boolean open() {
        try {
            // HBase configuration
            Configuration config = HBaseConfiguration.create();
            config.set("hbase.master", credentials.getHbaseMasterHost() + ":" + credentials.getHbaseMasterPort());
            config.set("hbase.zookeeper.quorum", credentials.getHbaseZookeeperQuorum());
            config.set("hbase.zookeeper.property.clientPort",
                    String.valueOf(credentials.getHbaseZookeeperClientPort()));
            admin = new HBaseAdmin(config);
        } catch (IOException ex) {
            // MasterNotRunningException and ZooKeeperConnectionException both extend IOException
            LOGGER.log(Level.SEVERE, null, ex);
            return false;
        }

        return admin != null;
    }

    /**
     * Creates (if missing) and opens the variant and effect tables, with
     * auto-flush disabled so puts are buffered client-side until
     * {@link #post()}.
     *
     * @return true if both tables could be opened
     */
    @Override
    public boolean pre() {
        try {
            // HBase variant table creation (one per species)
            if (!admin.tableExists(tableName)) {
                HTableDescriptor newTable = new HTableDescriptor(tableName.getBytes());
                // Add column family for samples
                HColumnDescriptor samplesDescriptor = new HColumnDescriptor(dataColumnFamily);
                samplesDescriptor.setCompressionType(Compression.Algorithm.SNAPPY);
                newTable.addFamily(samplesDescriptor);
                // Add column family for the raw main columns and statistics
                HColumnDescriptor statsDescriptor = new HColumnDescriptor(infoColumnFamily);
                statsDescriptor.setCompressionType(Compression.Algorithm.SNAPPY);
                newTable.addFamily(statsDescriptor);
                // Create table
                admin.createTable(newTable);
            }
            variantTable = new HTable(admin.getConfiguration(), tableName);
            variantTable.setAutoFlush(false, true);

            // HBase effect table creation (one per species)
            String tableEffectName = tableName + "effect";
            if (!admin.tableExists(tableEffectName)) {
                HTableDescriptor newEffectTable = new HTableDescriptor(tableEffectName.getBytes());
                // Add column family for effect
                HColumnDescriptor effectDescriptor = new HColumnDescriptor("e".getBytes());
                effectDescriptor.setCompressionType(Compression.Algorithm.SNAPPY);
                newEffectTable.addFamily(effectDescriptor);
                // Create effect table
                admin.createTable(newEffectTable);
            }
            effectTable = new HTable(admin.getConfiguration(), tableEffectName);
            effectTable.setAutoFlush(false, true);

            return variantTable != null && effectTable != null;
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return false;
        }
    }

    /** Convenience single-variant overload of {@link #write(List)}. */
    @Override
    public boolean write(Variant variant) {
        return write(Arrays.asList(variant));
    }

    /**
     * Builds the puts for a batch of variants and flushes them to HBase.
     *
     * @return true if the batch was written successfully
     */
    @Override
    public boolean write(List<Variant> data) {
        buildBatchRaw(data);
        if (this.includeEffect) {
            buildEffectRaw(data);
        }
        buildBatchIndex(data);
        return writeBatch(data);
    }

    /**
     * Sends the accumulated puts to the variant and effect tables and clears
     * the pending maps.
     *
     * @return true on success, false if an I/O error occurred
     */
    @Override
    protected boolean writeBatch(List<Variant> data) {
        // TODO Better error checking! Probably doing more variant-by-variant inserts
        try {
            // Insert raw variant data
            // TODO Track which ones were successful
            variantTable.put(new ArrayList<>(putMap.values()));
            putMap.clear();

            // Insert effect raw data
            // TODO Track which ones were successful
            effectTable.put(new ArrayList<>(effectPutMap.values()));
            effectPutMap.clear();
        } catch (IOException e) {
            // Log instead of silently swallowing: callers only see "false"
            LOGGER.log(Level.SEVERE, "Could not insert batch of variants", e);
            return false;
        }
        return true;
    }

    /** No secondary index is built for HBase storage; always succeeds. */
    @Override
    protected boolean buildBatchIndex(List<Variant> data) {
        return true;
    }

    /**
     * Builds the raw-data {@link Put}s for a batch of variants, skipping
     * variants (and per-file columns) already present in the table.
     *
     * @return true if the existence check succeeded, false on I/O error
     */
    @Override
    protected boolean buildBatchRaw(List<Variant> data) {
        // Query all variants in the batch instead of one by one
        List<Get> gets = new ArrayList<>(data.size());
        Result[] results;
        try {
            for (Variant v : data) {
                String rowkey = buildRowkey(v);
                gets.add(new Get(Bytes.toBytes(rowkey)));
            }
            results = variantTable.get(gets);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not retrieve variant rowkeys from database", ex);
            return false;
        }

        int i = 0;
        for (Variant v : data) {
            // Check that this variant was not stored yet
            if (results[i].isEmpty()) {
                String rowkey = buildRowkey(v);

                // Create raw data for inserting in HBase
                for (VariantSourceEntry archiveFile : v.getSourceEntries().values()) {
                    String prefix = source.getStudyId() + "_" + source.getFileId();

                    // Check that this variant IN THIS FILE was not stored yet
                    // (look for the column containing the file fields)
                    // NOTE(review): "_" + "_attrs" yields a double underscore ("__attrs").
                    // Kept as-is because existing rows were written with this qualifier;
                    // changing it would make old data invisible to this check.
                    byte[] attrsBytes = Bytes.toBytes(prefix + "_" + "_attrs");
                    if (results[i].containsColumn(dataColumnFamily, attrsBytes)) {
                        continue;
                    }

                    Put auxPut = putMap.get(rowkey);
                    if (auxPut == null) {
                        auxPut = new Put(Bytes.toBytes(rowkey));
                        putMap.put(rowkey, auxPut);
                    }

                    // Global fields (chr, start, ref, alt...)
                    // chr, start, end, ref, alt, id, type, length, hgvs
                    auxPut.add(dataColumnFamily, Bytes.toBytes("chr"), Bytes.toBytes(v.getChromosome()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("start"), Bytes.toBytes(v.getStart()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("end"), Bytes.toBytes(v.getEnd()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("length"), Bytes.toBytes(v.getLength()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("ref"), Bytes.toBytes(v.getReference()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("alt"), Bytes.toBytes(v.getAlternate()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("id"), Bytes.toBytes(v.getId()));
                    auxPut.add(dataColumnFamily, Bytes.toBytes("type"), Bytes.toBytes(v.getType().ordinal()));
                    // TODO How are we going to store HGVS really? It is available in VEP
                    // auxPut.add(dataColumnFamily, Bytes.toBytes("hgvs"), Bytes.toBytes(v.getHgvs()));

                    // Attributes that vary depending on the input format
                    VariantProtos.VariantFileAttributes attrs = buildAttributesProto(v, archiveFile);
                    auxPut.add(dataColumnFamily, attrsBytes, attrs.toByteArray());

                    if (includeSamples) {
                        for (String s : archiveFile.getSampleNames()) {
                            VariantProtos.VariantSample.Builder sp = VariantProtos.VariantSample.newBuilder();
                            sp.setSample(VcfUtils.getJoinedSampleFields(v, archiveFile, s));
                            byte[] qualifier = Bytes.toBytes(prefix + "_" + s);
                            auxPut.add(dataColumnFamily, qualifier, sp.build().toByteArray());
                        }
                    }

                    if (includeStats) {
                        VariantStatsProtos.VariantStats protoStats =
                                statsConverter.convertToStorageType(archiveFile.getStats());
                        byte[] qualifier = Bytes.toBytes(prefix + "_stats");
                        auxPut.add(dataColumnFamily, qualifier, protoStats.toByteArray());
                    }
                }
            } else {
                LOGGER.log(Level.WARNING, "Variant already existed: {0}:{1}",
                        new Object[]{v.getChromosome(), v.getStart()});
            }
            // BUGFIX: the index into results[] was never advanced, so every
            // variant after the first was compared against results[0]
            i++;
        }

        return true;
    }

    /**
     * Builds the effect {@link Put}s for a batch of variants.
     * Currently a no-op: the implementation is pending migration to the
     * current VariantEffect model (see commented draft).
     */
    @Override
    protected boolean buildEffectRaw(List<Variant> variants) {
//        for (Variant variant : variants) {
//            for (VariantEffect v : variant.getEffect()) {
//                String rowkey = buildRowkey(v.getChromosome(), String.valueOf(v.getPosition()));
//                VariantEffectProtos.EffectInfo effectProto = buildEffectProto(v);
//                String qualifier = v.getReferenceAllele() + "_" + v.getAlternativeAllele();
//
//                // TODO Insert in the map for HBase storage
//                // Put effectPut = new Put(Bytes.toBytes(rowkey));
//                // effectPut.add("e".getBytes(), qualifier.getBytes(), effectProto.toByteArray());
//                // effectPutMap.put(rowkey, effectPut);
//            }
//        }
        return true;
    }

    /**
     * Flushes any client-side buffered puts (auto-flush is disabled in
     * {@link #pre()}).
     *
     * @return true on success
     */
    @Override
    public boolean post() {
        try {
            variantTable.flushCommits();
            effectTable.flushCommits();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return false;
        }
        return true;
    }

    /**
     * Closes the admin client and both tables.
     *
     * @return true on success
     */
    @Override
    public boolean close() {
        try {
            admin.close();
            variantTable.close();
            effectTable.close();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, null, e);
            return false;
        }
        return true;
    }

    /*
     * ProtocolBuffers objects construction
     */

    /** Serializes a file's free-form attributes as a protobuf key/value list. */
    private VariantProtos.VariantFileAttributes buildAttributesProto(Variant v, VariantSourceEntry file) {
        VariantProtos.VariantFileAttributes.Builder builder = VariantProtos.VariantFileAttributes.newBuilder();
        for (Map.Entry<String, String> attr : file.getAttributes().entrySet()) {
            VariantProtos.VariantFileAttributes.KeyValue.Builder kvBuilder =
                    VariantProtos.VariantFileAttributes.KeyValue.newBuilder();
            kvBuilder.setKey(attr.getKey());
            kvBuilder.setValue(attr.getValue());
            builder.addAttrs(kvBuilder.build());
        }
        return builder.build();
    }

    /** Serializes variant statistics (counts, frequencies, MAF/MGF, pedigree info) to protobuf. */
    private VariantStatsProtos.VariantStats buildStatsProto(VariantStats v) {
        VariantStatsProtos.VariantStats.Builder builder = VariantStatsProtos.VariantStats.newBuilder();

        builder.setRefAlleleCount(v.getRefAlleleCount());
        builder.setAltAlleleCount(v.getAltAlleleCount());
        for (Map.Entry<Genotype, Integer> count : v.getGenotypesCount().entrySet()) {
            VariantStatsProtos.VariantStats.Count.Builder countBuilder =
                    VariantStatsProtos.VariantStats.Count.newBuilder();
            countBuilder.setKey(count.getKey().toString());
            countBuilder.setCount(count.getValue());
            builder.addGenotypesCount(countBuilder.build());
        }

        builder.setRefAlleleFreq(v.getRefAlleleFreq());
        builder.setAltAlleleFreq(v.getAltAlleleFreq());
        for (Map.Entry<Genotype, Float> freq : v.getGenotypesFreq().entrySet()) {
            VariantStatsProtos.VariantStats.Frequency.Builder countBuilder =
                    VariantStatsProtos.VariantStats.Frequency.newBuilder();
            countBuilder.setKey(freq.getKey().toString());
            countBuilder.setFrequency(freq.getValue());
            builder.addGenotypesFreq(countBuilder.build());
        }

        builder.setMissingAlleles(v.getMissingAlleles());
        builder.setMissingGenotypes(v.getMissingGenotypes());
        builder.setMaf(v.getMaf());
        builder.setMgf(v.getMgf());
        builder.setPassedFilters(v.hasPassedFilters());
        builder.setQuality(v.getQuality());
        builder.setNumSamples(v.getNumSamples());
        builder.setTransitionsCount(v.getTransitionsCount());
        builder.setTransversionsCount(v.getTransversionsCount());

        if (v.isPedigreeStatsAvailable()) {
            // Optional fields, they require pedigree information
            builder.setMendelianErrors(v.getMendelianErrors());
            builder.setCasesPercentDominant(v.getCasesPercentDominant());
            builder.setControlsPercentDominant(v.getControlsPercentDominant());
            builder.setCasesPercentRecessive(v.getCasesPercentRecessive());
            builder.setControlsPercentRecessive(v.getControlsPercentRecessive());
            // builder.setHardyWeinberg(effect.getHw().getpValue());
        }
        return builder.build();
    }

    /** Serializes a single variant effect (consequence type, feature, gene info) to protobuf. */
    private VariantEffectProtos.EffectInfo buildEffectProto(VariantEffect v) {
        VariantEffectProtos.EffectInfo.Builder effect = VariantEffectProtos.EffectInfo.newBuilder();
        effect.setReference(v.getReferenceAllele());
        effect.setAlternative(v.getAlternateAllele());
        effect.setChromosome(v.getChromosome());
        effect.setPosition(v.getPosition());
        effect.setFeatureId(v.getFeatureId());
        effect.setFeatureName(v.getFeatureName());
        effect.setFeatureBiotype(v.getFeatureBiotype());
        effect.setFeatureChromosome(v.getFeatureChromosome());
        effect.setFeatureStart(v.getFeatureStart());
        effect.setFeatureEnd(v.getFeatureEnd());
        effect.setSnpId(v.getSnpId());
        effect.setAncestral(v.getAncestral());
        effect.setGeneId(v.getGeneId());
        effect.setTranscriptId(v.getTranscriptId());
        effect.setGeneName(v.getGeneName());
        effect.setConsequenceType(v.getConsequenceType());
        effect.setConsequenceTypeObo(v.getConsequenceTypeObo());
        effect.setConsequenceTypeDesc(v.getConsequenceTypeDesc());
        effect.setConsequenceTypeType(v.getConsequenceTypeType());
        effect.setAaPosition(v.getAaPosition());
        effect.setAminoacidChange(v.getAminoacidChange());
        effect.setCodonChange(v.getCodonChange());
        return effect.build();
    }

    /*
     * Auxiliary functions
     */

    /**
     * Builds the HBase rowkey for a variant:
     * zero-padded chromosome, start, reference and alternate alleles joined
     * by underscores. Alleles longer than {@code Variant.SV_THRESHOLD} are
     * replaced by their SHA-1 digest to bound the key length.
     */
    private String buildRowkey(Variant v) {
        StringBuilder builder = new StringBuilder();
        builder.append(StringUtils.leftPad(v.getChromosome(), 4, '0'));
        builder.append("_");
        builder.append(v.getStart());
        builder.append("_");
        if (v.getReference().length() < Variant.SV_THRESHOLD) {
            builder.append(v.getReference());
        } else {
            builder.append(new String(CryptoUtils.encryptSha1(v.getReference())));
        }
        builder.append("_");
        if (v.getAlternate().length() < Variant.SV_THRESHOLD) {
            builder.append(v.getAlternate());
        } else {
            builder.append(new String(CryptoUtils.encryptSha1(v.getAlternate())));
        }
        return builder.toString();
    }

    /**
     * Enables or disables storage of statistics.
     * BUGFIX: also lazily creates the stats converter, which was previously
     * only instantiated in the constructor — enabling stats through this
     * setter caused an NPE in buildBatchRaw.
     */
    @Override
    public void includeStats(boolean b) {
        this.includeStats = b;
        if (b && statsConverter == null) {
            statsConverter = new VariantStatsToHbaseConverter();
        }
    }

    /** Enables or disables storage of per-sample data. */
    @Override
    public void includeSamples(boolean b) {
        this.includeSamples = b;
    }

    /** Enables or disables storage of variant effects. */
    @Override
    public void includeEffect(boolean b) {
        this.includeEffect = b;
    }
}