Java tutorial: HDHTWriter (com.datatorrent.contrib.hdht)
/**
 * Copyright (c) 2016 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.contrib.hdht;

import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import javax.validation.constraints.Min;

import org.codehaus.jackson.map.annotate.JsonSerialize;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.io.IOUtils;

import com.esotericsoftware.kryo.io.Output;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Throwables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import com.datatorrent.api.Context;
import com.datatorrent.api.Context.OperatorContext;
import com.datatorrent.api.DAG;
import com.datatorrent.api.Operator;
import com.datatorrent.api.Operator.CheckpointListener;
import com.datatorrent.common.util.NameableThreadFactory;
import com.datatorrent.contrib.hdht.HDHTWalManager.PreviousWALDetails;
import com.datatorrent.contrib.hdht.HDHTWalManager.WalPosition;
import com.datatorrent.lib.fileaccess.FileAccess;
import com.datatorrent.lib.fileaccess.FileAccess.FileReader;
import com.datatorrent.lib.fileaccess.FileAccess.FileWriter;
import com.datatorrent.lib.fileaccess.FileAccessFSImpl;
import com.datatorrent.lib.fileaccess.TFileImpl;
import com.datatorrent.netlet.util.Slice;

/**
 * Writes data to buckets. Can be sub-classed as operator or used in composite pattern.
 * <p>
 * Changes are accumulated in a write cache and written to a write-ahead-log (WAL). They are then asynchronously
 * flushed to the data files when thresholds for the memory buffer are reached. Changes are flushed to data files at
 * the committed window boundary.
 * <p>
 * When data is read through the same operator (extends reader), full consistency is guaranteed (reads will consider
 * changes that are not flushed). In the event of failure, the operator recovers the write buffer from the WAL.
 *
 * @displayName HDHT Writer
 * @category Output
 * @tags hdht, output operator
 *
 * @since 2.0.0
 */
public class HDHTWriter extends HDHTReader implements CheckpointListener, Operator, HDHT.Writer
{
  private FileAccess walStore;
  private final String WAL_FILES_LOCATION = "/WAL/";
  private final transient HashMap<Long, BucketMeta> metaCache = Maps.newHashMap();
  private long currentWindowId;
  private final transient HashMap<Long, Bucket> buckets = Maps.newHashMap();
  // After buckets are reorganized on dynamic repartitioning, this map maintains which WAL to look up
  // during recovery for each bucketKey
  public Map<Long, PreviousWALDetails> parentWalMetaDataMap = Maps.newHashMap();
  public HashMap<Long, HDHTWalManager.WalPosition> walPositions = Maps.newLinkedHashMap();
  public HDHTWalManager.WalPosition committedWalPosition;
  public List<PreviousWALDetails> parentWals = new LinkedList<>();
  public Set<PreviousWALDetails> alreadyCopiedWals = new HashSet<>();
  @VisibleForTesting
  protected transient ExecutorService writeExecutor;
  private transient volatile Throwable writerError;
  protected Set<Long> bucketKeys = Sets.newHashSet();
  protected WalPosition minimumRecoveryWalPosition = new WalPosition(0, 0);

  private int maxFileSize = 128 * 1024 * 1024; // 128m
  private int maxWalFileSize = 64 * 1024 * 1024;
  private int flushSize = 1000000;
  private int flushIntervalCount = 120;

  private transient OperatorContext context;

  static final byte[] DELETED = {};

  private transient HDHTWalManager wal;
  protected WalMeta singleWalMeta = new WalMeta();
  private long walKey = 0;
  private transient boolean bucketsRecovered = false;

  /**
   * Size limit for data files. Files are rolled once the limit has been exceeded. The final size of a file can be
   * larger than the limit by the size of the last/single entry written to it.
   *
   * @return The size limit for data files.
   */
  public int getMaxFileSize()
  {
    return maxFileSize;
  }

  public void setMaxFileSize(int maxFileSize)
  {
    this.maxFileSize = maxFileSize;
  }

  /**
   * Size limit for WAL files. Files are rolled once the limit has been exceeded. The final size of a file can be
   * larger than the limit, as files are rolled at the end of the operator window.
   *
   * @return The size limit for WAL files.
   */
  public int getMaxWalFileSize()
  {
    return maxWalFileSize;
  }

  public void setMaxWalFileSize(int maxWalFileSize)
  {
    this.maxWalFileSize = maxWalFileSize;
    if (this.wal != null) {
      this.wal.setMaxWalFileSize(maxWalFileSize);
    }
  }

  /**
   * The number of changes collected in memory before flushing to persistent storage.
   *
   * @return The number of changes collected in memory before flushing to persistent storage.
   */
  public int getFlushSize()
  {
    return flushSize;
  }

  public void setFlushSize(int flushSize)
  {
    this.flushSize = flushSize;
  }

  /**
   * Cached writes are flushed to persistent storage periodically. The interval is specified as a count of windows and
   * establishes the maximum latency for changes to be written while below the {@link #flushSize} threshold.
   *
   * @return The flush interval count.
   */
  @Min(value = 1)
  public int getFlushIntervalCount()
  {
    return flushIntervalCount;
  }

  public void setFlushIntervalCount(int flushIntervalCount)
  {
    this.flushIntervalCount = flushIntervalCount;
  }

  /**
   * Write data to size-based rolling files.
   *
   * @param bucket
   * @param bucketMeta
   * @param data
   * @throws IOException
   */
  private void writeFile(Bucket bucket, BucketMeta bucketMeta, TreeMap<Slice, Slice> data) throws IOException
  {
    BucketIOStats ioStats = getOrCretaStats(bucket.bucketKey);
    long startTime = System.currentTimeMillis();
    FileWriter fw = null;
    BucketFileMeta fileMeta = null;
    int keysWritten = 0;
    for (Map.Entry<Slice, Slice> dataEntry : data.entrySet()) {
      if (fw == null) {
        // next file
        fileMeta = bucketMeta.addFile(bucket.bucketKey, dataEntry.getKey());
        LOG.debug("writing data file {} {}", bucket.bucketKey, fileMeta.name);
        fw = this.store.getWriter(bucket.bucketKey, fileMeta.name + ".tmp");
        keysWritten = 0;
      }

      if (Arrays.equals(dataEntry.getValue().toByteArray(), DELETED)) {
        continue;
      }

      fw.append(dataEntry.getKey().toByteArray(), dataEntry.getValue().toByteArray());
      keysWritten++;
      if (fw.getBytesWritten() > this.maxFileSize) {
        ioStats.dataFilesWritten++;
        ioStats.filesWroteInCurrentWriteCycle++;
        // roll file
        fw.close();
        ioStats.dataBytesWritten += fw.getBytesWritten();
        this.store.rename(bucket.bucketKey, fileMeta.name + ".tmp", fileMeta.name);
        LOG.debug("created data file {} {} with {} entries", bucket.bucketKey, fileMeta.name, keysWritten);
        fw = null;
        keysWritten = 0;
      }
    }

    if (fw != null) {
      ioStats.dataFilesWritten++;
      ioStats.filesWroteInCurrentWriteCycle++;
      fw.close();
      ioStats.dataBytesWritten += fw.getBytesWritten();
      this.store.rename(bucket.bucketKey, fileMeta.name + ".tmp", fileMeta.name);
      LOG.debug("created data file {} {} with {} entries", bucket.bucketKey, fileMeta.name, keysWritten);
    }

    ioStats.dataWriteTime += System.currentTimeMillis() - startTime;
  }

  private Bucket getBucket(long bucketKey) throws IOException
  {
    Bucket bucket = this.buckets.get(bucketKey);
    bucketKeys.add(bucketKey);
    if (bucket == null) {
      LOG.debug("Opening bucket {}", bucketKey);
      bucket = new Bucket(keyComparator);
      bucket.bucketKey = bucketKey;
      this.buckets.put(bucketKey, bucket);
      BucketMeta bmeta = getMeta(bucketKey);
      WalMeta wmeta = getWalMeta(bucketKey);
      LOG.info("walStart {} walEnd {} windowId {} committedWid {} currentWid {}",
          bmeta.recoveryStartWalPosition, wmeta.cpWalPosition, wmeta.windowId, bmeta.committedWid, currentWindowId);

      BucketIOStats ioStats = getOrCretaStats(bucketKey);
      if (ioStats != null) {
        this.wal.restoreStats(ioStats);
      }

      // bmeta.componentLSN is the data which is committed to disk.
      // wmeta.windowId is the windowId till which data is available in the WAL.
      if (!bucketsRecovered && bmeta.committedWid < wmeta.windowId && wmeta.windowId != 0) {
        LOG.debug("Recovery for buckets {}", bucketKeys);
        // Add tuples from recovery start till recovery end.
        Map<Long, WriteCache> bucketMap = Maps.newHashMap();
        for (Long bucketKeyEntry : bucketKeys) {
          Bucket bucketEntry = buckets.get(bucketKeyEntry);
          if (bucketEntry == null) {
            bucketEntry = new Bucket(keyComparator);
            bucketEntry.bucketKey = bucketKeyEntry;
            this.buckets.put(bucketKeyEntry, bucketEntry);
          }
          bucketMap.put(bucketKeyEntry, bucketEntry.committedWriteCache);
        }
        this.wal.runRecovery(new HDHTWalManager.RecoveryContext(bucketMap, keyComparator,
            minimumRecoveryWalPosition, wmeta.cpWalPosition));

        // After recovery, data from the WAL has been added to the committed cache; update the WAL location up to
        // which data is present in the committed cache.
        this.committedWalPosition = wmeta.cpWalPosition;
        this.walPositions.put(wmeta.windowId, wmeta.cpWalPosition);
        bucketsRecovered = true;
      }
    }
    return bucket;
  }

  /**
   * Lookup in the write cache (data not yet flushed/committed to files).
   *
   * @param bucketKey
   * @param key
   * @return The uncommitted value, or null if the key is not present (or was deleted).
   */
  @Override
  public byte[] getUncommitted(long bucketKey, Slice key)
  {
    Bucket bucket = this.buckets.get(bucketKey);
    if (bucket != null) {
      byte[] v = bucket.writeCache.get(key);
      if (v != null) {
        return v != DELETED ? v : null;
      }
      for (Map.Entry<Long, WriteCache> entry : bucket.checkpointedWriteCache.entrySet()) {
        byte[] v2 = entry.getValue().get(key);
        // find most recent entry
        if (v2 != null) {
          v = v2;
        }
      }
      if (v != null) {
        return v != DELETED ? v : null;
      }
      v = bucket.committedWriteCache.get(key);
      if (v != null) {
        return v != DELETED ? v : null;
      }
      v = bucket.frozenWriteCache.get(key);
      return v != null && v != DELETED ? v : null;
    }
    return null;
  }

  /**
   * Intercept query processing to incorporate unwritten changes.
   */
  @Override
  protected void processQuery(HDSQuery query)
  {
    // check unwritten changes first
    byte[] v = getUncommitted(query.bucketKey, query.key);
    if (v != null) {
      query.result = v;
      query.processed = true;
      return;
    }
    super.processQuery(query);
  }

  @Override
  public void put(long bucketKey, Slice key, byte[] value) throws IOException
  {
    Bucket bucket = getBucket(bucketKey);
    this.wal.append(bucketKey, key, value);
    bucket.writeCache.put(key, value);
    updateQueryResultCache(bucketKey, key, value);
  }

  private void updateQueryResultCache(long bucketKey, Slice key, byte[] value)
  {
    HDSQuery q = queries.get(key);
    if (q != null) {
      q.processed = true;
      q.result = value;
    }
  }

  public void delete(long bucketKey, Slice key) throws IOException
  {
    put(bucketKey, key, DELETED);
  }

  /**
   * Process the purge operations that were performed.
   * Go over each file in the bucket and see if any purge operation affects the keys present in the file. If any
   * purge range overlaps with the file's key range, then the data needs to be written again with the keys in the
   * purge range removed.
   *
   * @param bucket bucket on which purge operations were performed.
   * @param bmeta metadata for the bucket.
   * @param filesToDelete deleted files are added to this set.
   * @return new bucket meta copy after processing of purge operations.
   * @throws IOException
   */
  private BucketMeta processPurge(Bucket bucket, BucketMeta bmeta, HashSet<String> filesToDelete) throws IOException
  {
    /* Nothing to do if no files are written */
    if (bmeta.files.isEmpty()) {
      LOG.debug("No existing files to purge data from bucket {}", bucket);
      return bmeta;
    }

    /* no purge request pending */
    WriteCache frozen = bucket.frozenWriteCache;
    if (frozen.getPurges() == null || frozen.getPurges().isEmpty()) {
      LOG.debug("No pending purge requests for bucket {}", bucket);
      return bmeta;
    }

    // Make a copy, because as files are deleted in writeFileWithPurge, the traversal
    // of the loop below would fail with a concurrent modification exception.
    BucketMeta bucketMetaCopy = kryo.copy(getMeta(bucket.bucketKey));

    Iterator<BucketFileMeta> fileIter = bmeta.files.values().iterator();
    Slice last = frozen.getPurges().getLast();
    while (fileIter.hasNext()) {
      BucketFileMeta fmeta = fileIter.next();
      /* If this file falls beyond the last purge end value, then break, as the remaining files will be outside of
         the purge range too.
       */
      if (keyComparator.compare(fmeta.startKey, last) > 0) {
        break;
      }

      Range<Slice> frange = new Range<>(fmeta.startKey, getEndKey(bucket.bucketKey, fmeta, frozen.getPurges()));
      RangeSet<Slice> rset = frozen.getPurges().getOverlappingRanges(frange);
      if (rset.isEmpty()) {
        continue;
      }

      writeFileWithPurge(bucket, fmeta, rset, filesToDelete, bucketMetaCopy);
    }
    return bucketMetaCopy;
  }

  /**
   * Write out the changes to the file caused by a purge operation. If any purge range completely covers the keys in
   * the file, then delete the file without even opening it and reading the keys from it. Otherwise, remove the keys
   * overlapping with the purge ranges and write the remaining data out to a new file.
   *
   * @param bucket bucket
   * @param meta file metadata.
   * @param rset purge range set which overlaps with the file.
   * @param filesToDelete if the file is being deleted completely, then it is added to this set.
   * @param bmeta bucket metadata.
   * @throws IOException
   */
  private void writeFileWithPurge(Bucket bucket, BucketFileMeta meta, RangeSet<Slice> rset,
      HashSet<String> filesToDelete, BucketMeta bmeta) throws IOException
  {
    LOG.debug("Writing file because of purge operation {}", meta);

    if (rset.containsFully(new Range<>(meta.startKey, getEndKey(bucket.bucketKey, meta, rset)))) {
      LOG.info("File being deleted because of purge {}", meta);
      filesToDelete.add(meta.name);
      bmeta.files.remove(meta.startKey);
      return;
    }

    TreeMap<Slice, Slice> fileData = readDataExcludingPurge(bucket, meta, rset);

    /* Rewrite the file if any key from the file was removed as part of the purge
       and there is still some data to be written. */
    if (fileData.size() > 0) {
      LOG.info("Rewriting file because of purge {}", meta);
      filesToDelete.add(meta.name);
      bmeta.files.remove(meta.startKey);
      writeFile(bucket, bmeta, fileData);
    }
  }

  /**
   * Read all data from the file, excluding the data which is masked by the purge ranges (rset).
   * This function uses seek to jump to the end of the current purge range, to avoid reading keys
   * which are present in the purge range.
   *
   * @param bucket Bucket
   * @param meta metadata about the file.
   * @param rset set of purge ranges.
   * @return data as a map.
   * @throws IOException
   */
  private TreeMap<Slice, Slice> readDataExcludingPurge(Bucket bucket, BucketFileMeta meta, RangeSet<Slice> rset)
      throws IOException
  {
    FileReader reader = store.getReader(bucket.bucketKey, meta.name);
    TreeMap<Slice, Slice> fileData = new TreeMap<>(keyComparator);

    /* Check if there is data in the initial part of the file before the next purge range */
    Slice key = new Slice(null, 0, 0);
    Slice value = new Slice(null, 0, 0);
    boolean valid = reader.next(key, value);

    for (Range<Slice> range : rset) {
      while (keyComparator.compare(key, range.start) < 0 && valid) {
        fileData.put(new Slice(key.buffer, key.offset, key.length), new Slice(value.buffer));
        valid = reader.next(key, value);
      }
      /* Need to check valid at every stage, because next wraps around the file
         and starts reading from the start of the file. */
      valid = reader.seek(range.end);
      if (!valid) {
        break;
      }
      valid = reader.next(key, value); // this will read the end key; we want to exclude this key.
      if (!valid) {
        break;
      }
      valid = reader.next(key, value); // go past the end key.
      if (!valid) {
        break;
      }
    }

    while (valid) {
      fileData.put(new Slice(key.buffer, key.offset, key.length), new Slice(value.buffer));
      valid = reader.next(key, value);
    }
    return fileData;
  }

  /**
   * Flush changes from the write cache to disk. New data files will be written and metadata replaced atomically. The
   * flush frequency determines the availability of changes to external readers.
   *
   * @throws IOException
   */
  private void writeDataFiles(Bucket bucket) throws IOException
  {
    BucketIOStats ioStats = getOrCretaStats(bucket.bucketKey);
    LOG.debug("Writing data files in bucket {}", bucket.bucketKey);
    // copy meta data on write
    BucketMeta bucketMetaCopy = kryo.copy(getMeta(bucket.bucketKey));

    /* Process purge requests before flushing data from the cache to maintain
       the order of purge and put operations. This makes sure that purged data
       is removed from files before new data is added to them. */
    HashSet<String> filesToDelete = Sets.newHashSet();
    bucketMetaCopy = processPurge(bucket, bucketMetaCopy, filesToDelete);

    // bucket keys by file
    TreeMap<Slice, BucketFileMeta> bucketSeqStarts = bucketMetaCopy.files;
    Map<BucketFileMeta, Map<Slice, Slice>> modifiedFiles = Maps.newHashMap();

    for (Map.Entry<Slice, byte[]> entry : bucket.frozenWriteCache.entrySet()) {
      // find file for key
      Map.Entry<Slice, BucketFileMeta> floorEntry = bucketSeqStarts.floorEntry(entry.getKey());
      BucketFileMeta floorFile;
      if (floorEntry != null) {
        floorFile = floorEntry.getValue();
      } else {
        floorEntry = bucketSeqStarts.firstEntry();
        if (floorEntry == null || floorEntry.getValue().name != null) {
          // no existing file or file with higher key
          floorFile = new BucketFileMeta();
        } else {
          // placeholder for new keys, move start key
          floorFile = floorEntry.getValue();
          bucketSeqStarts.remove(floorEntry.getKey());
        }
        floorFile.startKey = entry.getKey();
        if (floorFile.startKey.length != floorFile.startKey.buffer.length) {
          // normalize key for serialization
          floorFile.startKey = new Slice(floorFile.startKey.toByteArray());
        }
        bucketSeqStarts.put(floorFile.startKey, floorFile);
      }

      Map<Slice, Slice> fileUpdates = modifiedFiles.get(floorFile);
      if (fileUpdates == null) {
        modifiedFiles.put(floorFile, fileUpdates = Maps.newHashMap());
      }
      fileUpdates.put(entry.getKey(), new Slice(entry.getValue()));
    }

    // write modified files
    for (Map.Entry<BucketFileMeta, Map<Slice, Slice>> fileEntry : modifiedFiles.entrySet()) {
      BucketFileMeta fileMeta = fileEntry.getKey();
      TreeMap<Slice, Slice> fileData = new TreeMap<Slice, Slice>(getKeyComparator());

      if (fileMeta.name != null) {
        // load existing file
        long start = System.currentTimeMillis();
        FileReader reader = store.getReader(bucket.bucketKey, fileMeta.name);
        reader.readFully(fileData);
        ioStats.dataBytesRead += store.getFileSize(bucket.bucketKey, fileMeta.name);
        ioStats.dataReadTime += System.currentTimeMillis() - start;
        /* these keys are re-written */
        ioStats.dataKeysRewritten += fileData.size();
        ioStats.filesReadInCurrentWriteCycle++;
        ioStats.dataFilesRead++;
        reader.close();
        filesToDelete.add(fileMeta.name);
      }

      // apply updates
      fileData.putAll(fileEntry.getValue());
      // new file
      writeFile(bucket, bucketMetaCopy, fileData);
    }

    LOG.debug("Files written {} files read {}", ioStats.filesWroteInCurrentWriteCycle,
        ioStats.filesReadInCurrentWriteCycle);

    // flush meta data for new files
    try {
      LOG.debug("Writing {} with {} file entries", FNAME_META, bucketMetaCopy.files.size());
      OutputStream os = store.getOutputStream(bucket.bucketKey, FNAME_META + ".new");
      Output output = new Output(os);
      bucketMetaCopy.committedWid = bucket.committedLSN;
      bucketMetaCopy.recoveryStartWalPosition = bucket.recoveryStartWalPosition;
      kryo.writeClassAndObject(output, bucketMetaCopy);
      output.close();
      os.close();
      store.rename(bucket.bucketKey, FNAME_META + ".new", FNAME_META);
    } catch (IOException e) {
      throw new RuntimeException("Failed to write bucket meta data " + bucket.bucketKey, e);
    }

    // clear pending changes
    ioStats.dataKeysWritten += bucket.frozenWriteCache.size();
    // switch to new version
    this.metaCache.put(bucket.bucketKey, bucketMetaCopy);

    // delete old files
    for (String fileName : filesToDelete) {
      store.delete(bucket.bucketKey, fileName);
    }
    invalidateReader(bucket.bucketKey, filesToDelete);
    // clearing cache after invalidating readers
    bucket.frozenWriteCache.clear();

    // clean up WAL files which are not needed anymore.
    minimumRecoveryWalPosition = bucketMetaCopy.recoveryStartWalPosition;
    for (Long bucketId : this.bucketKeys) {
      BucketMeta meta = getMeta(bucketId);
      if (meta.recoveryStartWalPosition.fileId < minimumRecoveryWalPosition.fileId ||
          (meta.recoveryStartWalPosition.fileId == minimumRecoveryWalPosition.fileId &&
          meta.recoveryStartWalPosition.offset < minimumRecoveryWalPosition.offset)) {
        minimumRecoveryWalPosition = meta.recoveryStartWalPosition;
      }
    }
    this.wal.cleanup(minimumRecoveryWalPosition.fileId);

    ioStats.filesReadInCurrentWriteCycle = 0;
    ioStats.filesWroteInCurrentWriteCycle = 0;
  }

  @Override
  public void setup(OperatorContext context)
  {
    super.setup(context);
    if (context != null) {
      setWalKey(context.getId());
    }
    writeExecutor = Executors.newSingleThreadScheduledExecutor(
        new NameableThreadFactory(this.getClass().getSimpleName() + "-Writer"));
    this.context = context;

    if (this.walStore == null) {
      // If the WAL location is not specified, it is placed under <HDHT Location>/WAL/ when the HDHT location is
      // known; otherwise the default location is under <Application Path>/WAL/.
      this.walStore = new TFileImpl.DTFileImpl();
      if (this.store instanceof FileAccessFSImpl) {
        ((FileAccessFSImpl) this.walStore)
            .setBasePath(((FileAccessFSImpl) this.store).getBasePath() + WAL_FILES_LOCATION);
      } else {
        ((FileAccessFSImpl) this.walStore)
            .setBasePath(context.getValue(DAG.APPLICATION_PATH) + WAL_FILES_LOCATION);
      }
    }
    this.walStore.init();

    if (!this.parentWals.isEmpty()) {
      if (this.parentWals.size() == 1) {
        PreviousWALDetails parentWal = parentWals.iterator().next();
        this.singleWalMeta = new WalMeta(parentWal.getWindowId(), parentWal.getEndPosition());
      } else {
        this.singleWalMeta.cpWalPosition = new WalPosition(0, 0);
      }
    }

    this.wal = new HDHTWalManager(this.walStore, getWalKey(), this.singleWalMeta.cpWalPosition);
    this.wal.setMaxWalFileSize(maxWalFileSize);

    if (!this.parentWals.isEmpty()) {
      resetBucketMeta();
      if (this.parentWals.size() == 1) {
        // Copy the WAL files as is from the parent WAL
        this.walPositions = parentWals.iterator().next().walPositions;
        this.wal.copyPreviousWalFiles(parentWals, alreadyCopiedWals);
        alreadyCopiedWals.addAll(parentWals);
        parentWals.clear();
      } else {
        mergeParentWalFilesByWindow();
      }
    }
  }

  @Override
  public void teardown()
  {
    IOUtils.closeQuietly(this.wal);
    writeExecutor.shutdown();
    IOUtils.closeQuietly(this.walStore);
    super.teardown();
  }

  @Override
  public void beginWindow(long windowId)
  {
    super.beginWindow(windowId);
    this.currentWindowId = windowId;
  }

  @Override
  public void endWindow()
  {
    super.endWindow();
    try {
      if (this.wal != null) {
        this.wal.endWindow(currentWindowId);
        singleWalMeta.cpWalPosition = this.wal.getCurrentPosition();
        singleWalMeta.windowId = currentWindowId;
      }
    } catch (IOException e) {
      throw new RuntimeException("Failed to flush WAL", e);
    }

    // propagate writer exceptions
    if (writerError != null) {
      throw new RuntimeException("Error while flushing write cache.", this.writerError);
    }

    if (context != null) {
      updateStats();
      context.setCounters(bucketStats);
    }
  }

  private WalMeta getWalMeta(long bucketKey)
  {
    return singleWalMeta;
  }

  @Override
  public void checkpointed(long windowId)
  {
    for (final Bucket bucket : this.buckets.values()) {
      if (!bucket.writeCache.isEmpty()) {
        bucket.checkpointedWriteCache.put(windowId, bucket.writeCache);
        bucket.writeCache = new WriteCache(new DefaultKeyComparator());
      }
    }
    this.walPositions.put(windowId, this.wal.getCurrentPosition());
  }

  /**
   * Get meta data from the cache or load it on first access.
   *
   * @param bucketKey
   * @return The bucket meta.
   */
  private BucketMeta getMeta(long bucketKey)
  {
    BucketMeta bm = metaCache.get(bucketKey);
    if (bm == null) {
      bm = loadBucketMeta(bucketKey);
      metaCache.put(bucketKey, bm);
    }
    return bm;
  }

  private void resetBucketMeta()
  {
    for (Long bucketKey : bucketKeys) {
      metaCache.put(bucketKey, new BucketMeta(keyComparator));
    }
  }

  @Override
  public void committed(long committedWindowId)
  {
    // Remove stale parent files if not already removed
    if (!alreadyCopiedWals.isEmpty()) {
      this.wal.deletePreviousWalFiles(alreadyCopiedWals);
      alreadyCopiedWals.clear();
    }

    for (final Bucket bucket : this.buckets.values()) {
      for (Iterator<Map.Entry<Long, WriteCache>> cpIter = bucket.checkpointedWriteCache.entrySet()
          .iterator(); cpIter.hasNext();) {
        Map.Entry<Long, WriteCache> checkpointEntry = cpIter.next();
        if (checkpointEntry.getKey() <= committedWindowId) {
          bucket.committedWriteCache.merge(checkpointEntry.getValue());
          cpIter.remove();
        }
      }

      for (Iterator<Map.Entry<Long, HDHTWalManager.WalPosition>> wpIter = this.walPositions.entrySet()
          .iterator(); wpIter.hasNext();) {
        Map.Entry<Long, HDHTWalManager.WalPosition> entry = wpIter.next();
        if (entry.getKey() <= committedWindowId) {
          this.committedWalPosition = entry.getValue();
          wpIter.remove();
        }
      }

      if ((bucket.committedWriteCache.size() > this.flushSize ||
          currentWindowId - bucket.lastFlushWindowId > flushIntervalCount) &&
          !bucket.committedWriteCache.isEmpty()) {
        // ensure previous flush completed
        if (bucket.frozenWriteCache.isEmpty()) {
          bucket.frozenWriteCache = bucket.committedWriteCache;
          bucket.committedWriteCache = new WriteCache(keyComparator);
          bucket.recoveryStartWalPosition = this.committedWalPosition;
          bucket.committedLSN = committedWindowId;

          LOG.debug("Flushing data for bucket {} committedWid {} recoveryStartWalPosition {}",
              bucket.bucketKey, bucket.committedLSN, bucket.recoveryStartWalPosition);

          Runnable flushRunnable = new Runnable()
          {
            @Override
            public void run()
            {
              try {
                writeDataFiles(bucket);
              } catch (Throwable e) {
                LOG.debug("Write error: {}", e.getMessage());
                writerError = e;
              }
            }
          };
          this.writeExecutor.execute(flushRunnable);
          bucket.lastFlushWindowId = committedWindowId;
        }
      }
    }

    // propagate writer exceptions
    if (writerError != null) {
      throw new RuntimeException("Error while flushing write cache.", this.writerError);
    }
  }

  private static class Bucket
  {
    private long lastFlushWindowId;
    private long bucketKey;
    // keys that were modified and written to the WAL, but not yet persisted, by checkpoint
    private WriteCache writeCache = new WriteCache(new DefaultKeyComparator());
    private final LinkedHashMap<Long, WriteCache> checkpointedWriteCache = Maps.newLinkedHashMap();
    private WriteCache committedWriteCache = new WriteCache(new DefaultKeyComparator());
    // keys that are being flushed to data files
    private WriteCache frozenWriteCache = new WriteCache(new DefaultKeyComparator());
    private long committedLSN;
    public HDHTWalManager.WalPosition recoveryStartWalPosition;

    public Bucket(Comparator<Slice> cmp)
    {
      writeCache = new WriteCache(cmp);
      committedWriteCache = new WriteCache(cmp);
      frozenWriteCache = new WriteCache(cmp);
    }
  }

  @VisibleForTesting
  protected void forceWal() throws IOException
  {
    this.wal.close();
  }

  @VisibleForTesting
  protected int unflushedDataSize(long bucketKey) throws IOException
  {
    Bucket b = getBucket(bucketKey);
    return b.writeCache.size();
  }

  @VisibleForTesting
  protected int committedDataSize(long bucketKey) throws IOException
  {
    Bucket b = getBucket(bucketKey);
    return b.committedWriteCache.size();
  }

  private static final Logger LOG = LoggerFactory.getLogger(HDHTWriter.class);

  /* Holds the current file id for the WAL and the current recoveryEndWalOffset for the WAL */
  static class WalMeta
  {
    /* The current WAL file and recoveryEndWalOffset */
    // Window id which is written to the WAL.
    public long windowId;
    // Checkpointed WAL position.
    HDHTWalManager.WalPosition cpWalPosition;

    public WalMeta(long windowId, WalPosition walPosition)
    {
      this.windowId = windowId;
      this.cpWalPosition = walPosition;
    }

    public WalMeta()
    {
    }
  }

  @JsonSerialize
  public static class BucketIOStats implements Serializable
  {
    private static final long serialVersionUID = 201412091454L;
    /* Bytes written to the WAL till now */
    public long walBytesWritten;
    /* Number of times the WAL was flushed */
    public long walFlushCount;
    /* Amount of time spent while waiting for the WAL flush to disk, in milliseconds */
    public long walFlushTime;
    /* WAL keys written */
    public long walKeysWritten;

    /* Number of data files written */
    public long dataFilesWritten;
    /* Number of bytes written to data files */
    public long dataBytesWritten;
    /* Time taken for writing files */
    public long dataWriteTime;
    /* Total keys written to data files */
    public long dataKeysWritten;
    /* The number of keys which are re-written, i.e., keys which are read into memory from existing files
       and written again to new data files */
    public long dataKeysRewritten;
    /* records in memory */
    public long dataInWriteCache;
    public long dataInFrozenCache;
    public int filesReadInCurrentWriteCycle;
    public int filesWroteInCurrentWriteCycle;
    public int dataFilesRead;
    /* Total time spent in reading files during write */
    public long dataReadTime;
    /* Number of bytes read during data read */
    public long dataBytesRead;

    @Override
    public String toString()
    {
      return "BucketIOStats{" +
          "walBytesWritten=" + walBytesWritten +
          ", walFlushCount=" + walFlushCount +
          ", walFlushTime=" + walFlushTime +
          ", walKeysWritten=" + walKeysWritten +
          ", dataFilesWritten=" + dataFilesWritten +
          ", dataBytesWritten=" + dataBytesWritten +
          ", dataWriteTime=" + dataWriteTime +
          ", dataKeysWritten=" + dataKeysWritten +
          ", dataKeysRewritten=" + dataKeysRewritten +
          ", dataInWriteCache=" + dataInWriteCache +
          ", dataInFrozenCache=" + dataInFrozenCache +
          ", filesReadInCurrentWriteCycle=" + filesReadInCurrentWriteCycle +
          ", filesWroteInCurrentWriteCycle=" + filesWroteInCurrentWriteCycle +
          ", dataFilesRead=" + dataFilesRead +
          ", dataReadTime=" + dataReadTime +
          ", dataBytesRead=" + dataBytesRead +
          '}';
    }
  }

  private void updateStats()
  {
    for (Bucket bucket : buckets.values()) {
      BucketIOStats ioStats = getOrCretaStats(bucket.bucketKey);
      /* fill in stats for WAL */
      HDHTWalManager.WalStats walStats = this.wal.getCounters();
      ioStats.walBytesWritten = walStats.totalBytes;
      ioStats.walFlushCount = walStats.flushCounts;
      ioStats.walFlushTime = walStats.flushDuration;
      ioStats.walKeysWritten = walStats.totalKeys;
      ioStats.dataInWriteCache = bucket.writeCache.size();
      ioStats.dataInFrozenCache = bucket.frozenWriteCache.size();
    }
  }

  @JsonSerialize
  public static class AggregatedBucketIOStats implements Serializable
  {
    private static final long serialVersionUID = 201412091454L;
    public BucketIOStats globalStats = new BucketIOStats();
    /* Individual bucket stats */
    public Map<Long, BucketIOStats> aggregatedStats = Maps.newHashMap();
  }

  public static class BucketIOStatAggregator implements Serializable, Context.CountersAggregator
  {
    private static final long serialVersionUID = 201412091454L;

    @Override
    public Object aggregate(Collection<?> countersList)
    {
      AggregatedBucketIOStats aggStats = new AggregatedBucketIOStats();
      for (Object o : countersList) {
        @SuppressWarnings("unchecked")
        Map<Long, BucketIOStats> statMap = (Map<Long, BucketIOStats>) o;
        for (Long bId : statMap.keySet()) {
          BucketIOStats stats = statMap.get(bId);
          aggStats.globalStats.walBytesWritten += stats.walBytesWritten;
          aggStats.globalStats.walFlushCount += stats.walFlushCount;
          aggStats.globalStats.walFlushTime += stats.walFlushTime;
          aggStats.globalStats.walKeysWritten += stats.walKeysWritten;

          aggStats.globalStats.dataWriteTime += stats.dataWriteTime;
          aggStats.globalStats.dataFilesWritten += stats.dataFilesWritten;
          aggStats.globalStats.dataBytesWritten += stats.dataBytesWritten;
          aggStats.globalStats.dataKeysWritten += stats.dataKeysWritten;
          aggStats.globalStats.dataKeysRewritten += stats.dataKeysRewritten;
          aggStats.globalStats.dataInWriteCache += stats.dataInWriteCache;
          aggStats.globalStats.dataInFrozenCache += stats.dataInFrozenCache;
          aggStats.globalStats.filesReadInCurrentWriteCycle += stats.filesReadInCurrentWriteCycle;
          aggStats.globalStats.filesWroteInCurrentWriteCycle += stats.filesWroteInCurrentWriteCycle;

          aggStats.globalStats.dataReadTime += stats.dataReadTime;
          aggStats.globalStats.dataFilesRead += stats.dataFilesRead;
          aggStats.globalStats.dataBytesRead += stats.dataBytesRead;
          aggStats.aggregatedStats.put(bId, stats);
        }
      }
      return aggStats;
    }
  }

  /* A map holding stats for each bucket written by this partition */
  private final HashMap<Long, BucketIOStats> bucketStats = Maps.newHashMap();

  private BucketIOStats getOrCretaStats(long bucketKey)
  {
    BucketIOStats ioStats = bucketStats.get(bucketKey);
    if (ioStats == null) {
      ioStats = new BucketIOStats();
      bucketStats.put(bucketKey, ioStats);
    }
    return ioStats;
  }

  @Override
  public void purge(long bucketKey, Slice start, Slice end) throws IOException
  {
    Bucket bucket = getBucket(bucketKey);
    this.wal.append(new HDHTLogEntry.PurgeEntry(bucketKey, start, end));
    bucket.writeCache.purge(start, end);
  }

  /**
   * Returns the approximate end key of the file.
   *
   * To avoid reading the whole file, this function returns the start key of the next file.
   * This information is available from metadata and does not require any disk I/O.
   *
   * For the last file in the list, if any purge ranges are provided, then go over each range
   * in the range set and check whether we can seek to its endKey successfully; a successful seek
   * means that the file contains a key greater than the range's endKey. In that case try the next
   * range. This approach works because the range set is sorted by the start and end keys and does
   * not contain any overlapping ranges (overlapping ranges are already taken care of during the
   * merge of the ranges).
   *
   * If the file contains data beyond the last range provided, then we have no option other than
   * to read the file and return the last key.
   * We cannot generate the next key from the last range's endKey, because we do not have knowledge of the comparator.
   */
  protected Slice getEndKey(long bucketKey, BucketFileMeta fmeta, RangeSet<Slice> rset) throws IOException
  {
    BucketMeta bmeta = getMeta(bucketKey);
    Map.Entry<Slice, BucketFileMeta> entry = bmeta.files.higherEntry(fmeta.startKey);
    if (entry != null) {
      return entry.getKey();
    }

    boolean valid = true;
    FileReader reader = store.getReader(bucketKey, fmeta.name);
    if (rset != null) {
      for (Range<Slice> range : rset) {
        valid = reader.seek(range.end);
        if (!valid) {
          return range.end;
        }
      }
    }

    Slice key = new Slice(null, 0, 0);
    Slice value = new Slice(null, 0, 0);
    while (valid) {
      valid = reader.next(key, value);
    }
    return key;
  }

  public FileAccess getWalStore()
  {
    return walStore;
  }

  public void setWalStore(FileAccess walStore)
  {
    this.walStore = walStore;
  }

  /**
   * Merge multiple parent WAL files into a single WAL file by copying parent WAL entries ordered by window ids.
   */
  public void mergeParentWalFilesByWindow()
  {
    try {
      // Copy all the WAL file contents which are committed and removed from checkpointed window position states.
      // The order of window ids need not be maintained for an already committed WAL, so copy by appending.
      for (PreviousWALDetails parentWal : parentWals) {
        if (parentWal.getCommittedWalPosition() != null) {
          this.wal.copyWALFiles(parentWal.getStartPosition(), parentWal.getCommittedWalPosition(),
              parentWal.getWalKey());
        }
      }
      for (Long bucketKey : bucketKeys) {
        BucketMeta meta = metaCache.get(bucketKey);
        meta.recoveryStartWalPosition = this.wal.getCurrentPosition();
      }
      // Copy remaining checkpointed window positions ordered by windowId
      this.wal.mergeWalFiles(parentWals, walPositions);
      this.forceWal();
      this.wal.writer = null;
      alreadyCopiedWals.addAll(parentWals);
      parentWals.clear();
      singleWalMeta.cpWalPosition = this.wal.getCurrentPosition();
      // Reset WAL recovery position to the beginning of the WAL files
      minimumRecoveryWalPosition = new WalPosition(0, 0);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }

  public long getWalKey()
  {
    return walKey;
  }

  public void setWalKey(long walKey)
  {
    this.walKey = walKey;
  }
}
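A usage sketch, separate from the HDHTWriter source above. HDHTWriter has no input port of its own; it is meant to be subclassed (or used in a composite) by an operator that maps each incoming tuple to a bucket key, a key Slice and a value, and calls put(). Everything below is hypothetical and only illustrates the API shown above: the package, class and port names are made up, KeyValPair is assumed to come from the Malhar library, and setFileStore() is assumed to be inherited from HDHTReader. Malhar's AbstractSinglePortHDHTWriter is a more complete starting point for production use.

package com.example.hdht; // hypothetical package for this sketch

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.contrib.hdht.HDHTWriter;
import com.datatorrent.lib.util.KeyValPair;
import com.datatorrent.netlet.util.Slice;

public class StringKeyValueHDHTWriter extends HDHTWriter
{
  // single bucket for this sketch; real applications typically derive the bucket from the key
  private static final long BUCKET_KEY = 1;

  public final transient DefaultInputPort<KeyValPair<String, String>> input =
      new DefaultInputPort<KeyValPair<String, String>>()
  {
    @Override
    public void process(KeyValPair<String, String> tuple)
    {
      try {
        byte[] key = tuple.getKey().getBytes(StandardCharsets.UTF_8);
        byte[] value = tuple.getValue().getBytes(StandardCharsets.UTF_8);
        // put() appends the change to the WAL and the write cache; data files are written
        // asynchronously after the committed window boundary, as described in the class javadoc.
        put(BUCKET_KEY, new Slice(key), value);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  };
}

Wiring the writer into a DAG could then look like the following, again as an assumption-laden sketch; the base path and the flush settings are arbitrary example values.

// inside StreamingApplication.populateDAG(DAG dag, Configuration conf)
StringKeyValueHDHTWriter hdht = dag.addOperator("Store", new StringKeyValueHDHTWriter());
TFileImpl.DTFileImpl fileAccess = new TFileImpl.DTFileImpl();
fileAccess.setBasePath("/tmp/hdht");   // example path; the WAL then defaults to /tmp/hdht/WAL/ (see setup())
hdht.setFileStore(fileAccess);         // assumed setter inherited from HDHTReader
hdht.setFlushIntervalCount(30);        // flush at least every 30 windows
hdht.setFlushSize(500000);             // or earlier, once 500k changes are cached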