/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.realtime.appenderator;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Supplier;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.apache.commons.io.FileUtils;
import org.apache.druid.client.cache.Cache;
import org.apache.druid.client.cache.CacheConfig;
import org.apache.druid.client.cache.CachePopulatorStats;
import org.apache.druid.common.guava.ThreadRenamingCallable;
import org.apache.druid.data.input.Committer;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.RE;
import org.apache.druid.java.util.common.RetryUtils;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.java.util.emitter.EmittingLogger;
import org.apache.druid.java.util.emitter.service.ServiceEmitter;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryRunner;
import org.apache.druid.query.QueryRunnerFactoryConglomerate;
import org.apache.druid.query.QuerySegmentWalker;
import org.apache.druid.query.SegmentDescriptor;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexSegment;
import org.apache.druid.segment.Segment;
import org.apache.druid.segment.incremental.IncrementalIndexAddResult;
import org.apache.druid.segment.incremental.IndexSizeExceededException;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.TuningConfigs;
import org.apache.druid.segment.loading.DataSegmentPusher;
import org.apache.druid.segment.realtime.FireDepartmentMetrics;
import org.apache.druid.segment.realtime.FireHydrant;
import org.apache.druid.segment.realtime.plumber.Sink;
import org.apache.druid.server.coordination.DataSegmentAnnouncer;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.VersionedIntervalTimeline;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

/**
 */
public class AppenderatorImpl implements Appenderator
{
  private static final EmittingLogger log = new EmittingLogger(AppenderatorImpl.class);
  private static final int WARN_DELAY = 1000;
  private static final String IDENTIFIER_FILE_NAME = "identifier.json";

  private final DataSchema schema;
  private final AppenderatorConfig tuningConfig;
  private final FireDepartmentMetrics metrics;
  private final DataSegmentPusher dataSegmentPusher;
  private final ObjectMapper objectMapper;
  private final DataSegmentAnnouncer segmentAnnouncer;
  private final IndexIO indexIO;
  private final IndexMerger indexMerger;
  private final Cache cache;
  private final Map<SegmentIdentifier, Sink> sinks = new ConcurrentHashMap<>();
  private final Set<SegmentIdentifier> droppingSinks = Sets.newConcurrentHashSet();
  private final VersionedIntervalTimeline<String, Sink> sinkTimeline = new VersionedIntervalTimeline<>(
      String.CASE_INSENSITIVE_ORDER
  );
  private final long maxBytesTuningConfig;
  private final QuerySegmentWalker texasRanger;

  // This variable updated in add(), persist(), and drop()
  private final AtomicInteger rowsCurrentlyInMemory = new AtomicInteger();
  private final AtomicInteger totalRows = new AtomicInteger();
  private final AtomicLong bytesCurrentlyInMemory = new AtomicLong();

  // Synchronize persisting commitMetadata so that multiple persist threads (if present)
  // and abandon threads do not step over each other
  private final Lock commitLock = new ReentrantLock();

  private final AtomicBoolean closed = new AtomicBoolean(false);

  private volatile ListeningExecutorService persistExecutor = null;
  private volatile ListeningExecutorService pushExecutor = null;
  // use intermediate executor so that deadlock conditions can be prevented
  // where persist and push Executor try to put tasks in each other queues
  // thus creating circular dependency
  private volatile ListeningExecutorService intermediateTempExecutor = null;
  private volatile long nextFlush;
  private volatile FileLock basePersistDirLock = null;
  private volatile FileChannel basePersistDirLockChannel = null;
  private volatile Throwable persistError;

  AppenderatorImpl(
      DataSchema schema,
      AppenderatorConfig tuningConfig,
      FireDepartmentMetrics metrics,
      DataSegmentPusher dataSegmentPusher,
      ObjectMapper objectMapper,
      QueryRunnerFactoryConglomerate conglomerate,
      DataSegmentAnnouncer segmentAnnouncer,
      ServiceEmitter emitter,
      ExecutorService queryExecutorService,
      IndexIO indexIO,
      IndexMerger indexMerger,
      Cache cache,
      CacheConfig cacheConfig,
      CachePopulatorStats cachePopulatorStats
  )
  {
    this.schema = Preconditions.checkNotNull(schema, "schema");
    this.tuningConfig = Preconditions.checkNotNull(tuningConfig, "tuningConfig");
    this.metrics = Preconditions.checkNotNull(metrics, "metrics");
    this.dataSegmentPusher = Preconditions.checkNotNull(dataSegmentPusher, "dataSegmentPusher");
    this.objectMapper = Preconditions.checkNotNull(objectMapper, "objectMapper");
    this.segmentAnnouncer = Preconditions.checkNotNull(segmentAnnouncer, "segmentAnnouncer");
    this.indexIO = Preconditions.checkNotNull(indexIO, "indexIO");
    this.indexMerger = Preconditions.checkNotNull(indexMerger, "indexMerger");
    this.cache = cache;
    this.texasRanger = conglomerate == null ? null : new SinkQuerySegmentWalker(
        schema.getDataSource(),
        sinkTimeline,
        objectMapper,
        emitter,
        conglomerate,
        queryExecutorService,
        Preconditions.checkNotNull(cache, "cache"),
        cacheConfig,
        cachePopulatorStats
    );

    maxBytesTuningConfig = TuningConfigs.getMaxBytesInMemoryOrDefault(tuningConfig.getMaxBytesInMemory());

    log.info("Created Appenderator for dataSource[%s].", schema.getDataSource());
  }

  @Override
  public String getDataSource()
  {
    return schema.getDataSource();
  }

  @Override
  public Object startJob()
  {
    tuningConfig.getBasePersistDirectory().mkdirs();
    lockBasePersistDirectory();
    final Object retVal = bootstrapSinksFromDisk();
    initializeExecutors();
    resetNextFlush();
    return retVal;
  }

  private void throwPersistErrorIfExists()
  {
    if (persistError != null) {
      throw new RE(persistError, "Error while persisting");
    }
  }

  @Override
  public AppenderatorAddResult add(
      final SegmentIdentifier identifier,
      final InputRow row,
      @Nullable final Supplier<Committer> committerSupplier,
      final boolean allowIncrementalPersists
  ) throws IndexSizeExceededException, SegmentNotWritableException
  {
    throwPersistErrorIfExists();

    if (!identifier.getDataSource().equals(schema.getDataSource())) {
      throw new IAE(
          "Expected dataSource[%s] but was asked to insert row for dataSource[%s]?!",
          schema.getDataSource(),
          identifier.getDataSource()
      );
    }

    final Sink sink = getOrCreateSink(identifier);
    metrics.reportMessageMaxTimestamp(row.getTimestampFromEpoch());
    final int sinkRowsInMemoryBeforeAdd = sink.getNumRowsInMemory();
    final int sinkRowsInMemoryAfterAdd;
    final long bytesInMemoryBeforeAdd = sink.getBytesInMemory();
    final long bytesInMemoryAfterAdd;

    final IncrementalIndexAddResult addResult;
    try {
      addResult = sink.add(row, !allowIncrementalPersists);
      sinkRowsInMemoryAfterAdd = addResult.getRowCount();
      bytesInMemoryAfterAdd = addResult.getBytesInMemory();
    }
    catch (IndexSizeExceededException e) {
      // Uh oh, we can't do anything about this! We can't persist (commit metadata would be out of sync) and we
      // can't add the row (it just failed). This should never actually happen, though, because we check
      // sink.canAddRow after returning from add.
      log.error(e, "Sink for segment[%s] was unexpectedly full!", identifier);
      throw e;
    }

    if (sinkRowsInMemoryAfterAdd < 0) {
      throw new SegmentNotWritableException("Attempt to add row to swapped-out sink for segment[%s].", identifier);
    }

    final int numAddedRows = sinkRowsInMemoryAfterAdd - sinkRowsInMemoryBeforeAdd;
    rowsCurrentlyInMemory.addAndGet(numAddedRows);
    bytesCurrentlyInMemory.addAndGet(bytesInMemoryAfterAdd - bytesInMemoryBeforeAdd);
    totalRows.addAndGet(numAddedRows);

    boolean isPersistRequired = false;
    boolean persist = false;
    List<String> persistReasons = new ArrayList<>();

    if (!sink.canAppendRow()) {
      persist = true;
      persistReasons.add("No more rows can be appended to sink");
    }
    if (System.currentTimeMillis() > nextFlush) {
      persist = true;
      persistReasons.add(StringUtils.format(
          "current time[%d] is greater than nextFlush[%d]",
          System.currentTimeMillis(),
          nextFlush
      ));
    }
    if (rowsCurrentlyInMemory.get() >= tuningConfig.getMaxRowsInMemory()) {
      persist = true;
      persistReasons.add(StringUtils.format(
          "rowsCurrentlyInMemory[%d] is greater than maxRowsInMemory[%d]",
          rowsCurrentlyInMemory.get(),
          tuningConfig.getMaxRowsInMemory()
      ));
    }
    if (bytesCurrentlyInMemory.get() >= maxBytesTuningConfig) {
      persist = true;
      persistReasons.add(StringUtils.format(
          "bytesCurrentlyInMemory[%d] is greater than maxBytesInMemory[%d]",
          bytesCurrentlyInMemory.get(),
          maxBytesTuningConfig
      ));
    }
    if (persist) {
      if (allowIncrementalPersists) {
        // persistAll clears rowsCurrentlyInMemory, no need to update it.
        log.info("Persisting rows in memory due to: [%s]", String.join(",", persistReasons));
        Futures.addCallback(
            persistAll(committerSupplier == null ? null : committerSupplier.get()),
            new FutureCallback<Object>()
            {
              @Override
              public void onSuccess(@Nullable Object result)
              {
                // do nothing
              }

              @Override
              public void onFailure(Throwable t)
              {
                persistError = t;
              }
            }
        );
      } else {
        isPersistRequired = true;
      }
    }
    return new AppenderatorAddResult(identifier, sink.getNumRows(), isPersistRequired, addResult.getParseException());
  }

  @Override
  public List<SegmentIdentifier> getSegments()
  {
    return ImmutableList.copyOf(sinks.keySet());
  }

  @Override
  public int getRowCount(final SegmentIdentifier identifier)
  {
    final Sink sink = sinks.get(identifier);

    if (sink == null) {
      throw new ISE("No such sink: %s", identifier);
    } else {
      return sink.getNumRows();
    }
  }

  @Override
  public int getTotalRowCount()
  {
    return totalRows.get();
  }

  @VisibleForTesting
  int getRowsInMemory()
  {
    return rowsCurrentlyInMemory.get();
  }

  @VisibleForTesting
  long getBytesCurrentlyInMemory()
  {
    return bytesCurrentlyInMemory.get();
  }

  @VisibleForTesting
  long getBytesInMemory(SegmentIdentifier identifier)
  {
    final Sink sink = sinks.get(identifier);

    if (sink == null) {
      throw new ISE("No such sink: %s", identifier);
    } else {
      return sink.getBytesInMemory();
    }
  }

  private Sink getOrCreateSink(final SegmentIdentifier identifier)
  {
    Sink retVal = sinks.get(identifier);

    if (retVal == null) {
      retVal = new Sink(
          identifier.getInterval(),
          schema,
          identifier.getShardSpec(),
          identifier.getVersion(),
          tuningConfig.getMaxRowsInMemory(),
          maxBytesTuningConfig,
          tuningConfig.isReportParseExceptions(),
          null
      );

      try {
        segmentAnnouncer.announceSegment(retVal.getSegment());
      }
      catch (IOException e) {
        log.makeAlert(e, "Failed to announce new segment[%s]", schema.getDataSource())
           .addData("interval", retVal.getInterval())
           .emit();
      }

      sinks.put(identifier, retVal);
      metrics.setSinkCount(sinks.size());
      sinkTimeline.add(retVal.getInterval(), retVal.getVersion(), identifier.getShardSpec().createChunk(retVal));
    }

    return retVal;
  }

  @Override
  public <T> QueryRunner<T> getQueryRunnerForIntervals(final Query<T> query, final Iterable<Interval> intervals)
  {
    if (texasRanger == null) {
      throw new IllegalStateException("Don't query me, bro.");
    }

    return texasRanger.getQueryRunnerForIntervals(query, intervals);
  }

  @Override
  public <T> QueryRunner<T> getQueryRunnerForSegments(final Query<T> query, final Iterable<SegmentDescriptor> specs)
  {
    if (texasRanger == null) {
      throw new IllegalStateException("Don't query me, bro.");
    }

    return texasRanger.getQueryRunnerForSegments(query, specs);
  }

  @Override
  public void clear() throws InterruptedException
  {
    // Drop commit metadata, then abandon all segments.
    try {
      throwPersistErrorIfExists();

      if (persistExecutor != null) {
        final ListenableFuture<?> uncommitFuture = persistExecutor.submit(
            () -> {
              try {
                commitLock.lock();
                objectMapper.writeValue(computeCommitFile(), Committed.nil());
              }
              finally {
                commitLock.unlock();
              }
              return null;
            }
        );

        // Await uncommit.
        uncommitFuture.get();

        // Drop everything.
        final List<ListenableFuture<?>> futures = Lists.newArrayList();
        for (Map.Entry<SegmentIdentifier, Sink> entry : sinks.entrySet()) {
          futures.add(abandonSegment(entry.getKey(), entry.getValue(), true));
        }

        // Await dropping.
        Futures.allAsList(futures).get();
      }
    }
    catch (ExecutionException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public ListenableFuture<?> drop(final SegmentIdentifier identifier)
  {
    final Sink sink = sinks.get(identifier);
    if (sink != null) {
      return abandonSegment(identifier, sink, true);
    } else {
      return Futures.immediateFuture(null);
    }
  }

  @Override
  public ListenableFuture<Object> persistAll(@Nullable final Committer committer)
  {
    throwPersistErrorIfExists();

    final Map<String, Integer> currentHydrants = Maps.newHashMap();
    final List<Pair<FireHydrant, SegmentIdentifier>> indexesToPersist = Lists.newArrayList();
    int numPersistedRows = 0;
    long bytesPersisted = 0L;
    for (SegmentIdentifier identifier : sinks.keySet()) {
      final Sink sink = sinks.get(identifier);
      if (sink == null) {
        throw new ISE("No sink for identifier: %s", identifier);
      }
      final List<FireHydrant> hydrants = Lists.newArrayList(sink);
      currentHydrants.put(identifier.getIdentifierAsString(), hydrants.size());
      numPersistedRows += sink.getNumRowsInMemory();
      bytesPersisted += sink.getBytesInMemory();

      final int limit = sink.isWritable() ? hydrants.size() - 1 : hydrants.size();

      for (FireHydrant hydrant : hydrants.subList(0, limit)) {
        if (!hydrant.hasSwapped()) {
          log.info("Hydrant[%s] hasn't persisted yet, persisting. Segment[%s]", hydrant, identifier);
          indexesToPersist.add(Pair.of(hydrant, identifier));
        }
      }

      if (sink.swappable()) {
        indexesToPersist.add(Pair.of(sink.swap(), identifier));
      }
    }

    log.info("Submitting persist runnable for dataSource[%s]", schema.getDataSource());

    final String threadName = StringUtils.format("%s-incremental-persist", schema.getDataSource());
    final Object commitMetadata = committer == null ? null : committer.getMetadata();
    final Stopwatch runExecStopwatch = Stopwatch.createStarted();
    final Stopwatch persistStopwatch = Stopwatch.createStarted();
    final ListenableFuture<Object> future = persistExecutor.submit(
        new ThreadRenamingCallable<Object>(threadName)
        {
          @Override
          public Object doCall() throws IOException
          {
            try {
              for (Pair<FireHydrant, SegmentIdentifier> pair : indexesToPersist) {
                metrics.incrementRowOutputCount(persistHydrant(pair.lhs, pair.rhs));
              }

              if (committer != null) {
                log.info(
                    "Committing metadata[%s] for sinks[%s].",
                    commitMetadata,
                    Joiner.on(", ").join(
                        currentHydrants.entrySet()
                                       .stream()
                                       .map(entry -> StringUtils.format("%s:%d", entry.getKey(), entry.getValue()))
                                       .collect(Collectors.toList())
                    )
                );

                committer.run();

                try {
                  commitLock.lock();
                  final Map<String, Integer> commitHydrants = Maps.newHashMap();
                  final Committed oldCommit = readCommit();
                  if (oldCommit != null) {
                    // merge current hydrants with existing hydrants
                    commitHydrants.putAll(oldCommit.getHydrants());
                  }
                  commitHydrants.putAll(currentHydrants);
                  writeCommit(new Committed(commitHydrants, commitMetadata));
                }
                finally {
                  commitLock.unlock();
                }
              }

              // return null if committer is null
              return commitMetadata;
            }
            catch (IOException e) {
              metrics.incrementFailedPersists();
              throw e;
            }
            finally {
              metrics.incrementNumPersists();
              metrics.incrementPersistTimeMillis(persistStopwatch.elapsed(TimeUnit.MILLISECONDS));
              persistStopwatch.stop();
            }
          }
        }
    );

    final long startDelay = runExecStopwatch.elapsed(TimeUnit.MILLISECONDS);
    metrics.incrementPersistBackPressureMillis(startDelay);
    if (startDelay > WARN_DELAY) {
      log.warn("Ingestion was throttled for [%,d] millis because persists were pending.", startDelay);
    }
    runExecStopwatch.stop();
    resetNextFlush();

    // NB: The rows are still in memory until they're done persisting, but we only count rows in active indexes.
    rowsCurrentlyInMemory.addAndGet(-numPersistedRows);
    bytesCurrentlyInMemory.addAndGet(-bytesPersisted);

    return future;
  }

  @Override
  public ListenableFuture<SegmentsAndMetadata> push(
      final Collection<SegmentIdentifier> identifiers,
      @Nullable final Committer committer,
      final boolean useUniquePath
  )
  {
    final Map<SegmentIdentifier, Sink> theSinks = Maps.newHashMap();
    for (final SegmentIdentifier identifier : identifiers) {
      final Sink sink = sinks.get(identifier);
      if (sink == null) {
        throw new ISE("No sink for identifier: %s", identifier);
      }
      theSinks.put(identifier, sink);
      if (sink.finishWriting()) {
        totalRows.addAndGet(-sink.getNumRows());
      }
    }

    return Futures.transform(
        // We should always persist all segments regardless of the input because metadata should be committed for all
        // segments.
        persistAll(committer),
        (Function<Object, SegmentsAndMetadata>) commitMetadata -> {
          final List<DataSegment> dataSegments = Lists.newArrayList();

          for (Map.Entry<SegmentIdentifier, Sink> entry : theSinks.entrySet()) {
            if (droppingSinks.contains(entry.getKey())) {
              log.info("Skipping push of currently-dropping sink[%s]", entry.getKey());
              continue;
            }

            final DataSegment dataSegment = mergeAndPush(entry.getKey(), entry.getValue(), useUniquePath);
            if (dataSegment != null) {
              dataSegments.add(dataSegment);
            } else {
              log.warn("mergeAndPush[%s] returned null, skipping.", entry.getKey());
            }
          }

          return new SegmentsAndMetadata(dataSegments, commitMetadata);
        },
        pushExecutor
    );
  }

  /**
   * Insert a barrier into the merge-and-push queue. When this future resolves, all pending pushes will have finished.
   * This is useful if we're going to do something that would otherwise potentially break currently in-progress
   * pushes.
   */
  private ListenableFuture<?> pushBarrier()
  {
    return intermediateTempExecutor.submit(
        (Runnable) () -> pushExecutor.submit(() -> {})
    );
  }

  /**
   * Merge segment, push to deep storage. Should only be used on segments that have been fully persisted. Must only
   * be run in the single-threaded pushExecutor.
   *
   * @param identifier    sink identifier
   * @param sink          sink to push
   * @param useUniquePath true if the segment should be written to a path with a unique identifier
   *
   * @return segment descriptor, or null if the sink is no longer valid
   */
  private DataSegment mergeAndPush(final SegmentIdentifier identifier, final Sink sink, final boolean useUniquePath)
  {
    // Bail out if this sink is null or otherwise not what we expect.
    if (sinks.get(identifier) != sink) {
      log.warn("Sink for segment[%s] no longer valid, bailing out of mergeAndPush.", identifier);
      return null;
    }

    // Use a descriptor file to indicate that pushing has completed.
    final File persistDir = computePersistDir(identifier);
    final File mergedTarget = new File(persistDir, "merged");
    final File descriptorFile = computeDescriptorFile(identifier);

    // Sanity checks
    for (FireHydrant hydrant : sink) {
      if (sink.isWritable()) {
        throw new ISE("WTF?! Expected sink to be no longer writable before mergeAndPush. Segment[%s].", identifier);
      }

      synchronized (hydrant) {
        if (!hydrant.hasSwapped()) {
          throw new ISE("WTF?! Expected sink to be fully persisted before mergeAndPush. Segment[%s].", identifier);
        }
      }
    }

    try {
      if (descriptorFile.exists()) {
        // Already pushed.

        if (useUniquePath) {
          // Don't reuse the descriptor, because the caller asked for a unique path. Leave the old one as-is, since
          // it might serve some unknown purpose.
          log.info("Pushing segment[%s] again with new unique path.", identifier);
        } else {
          log.info("Segment[%s] already pushed.", identifier);
          return objectMapper.readValue(descriptorFile, DataSegment.class);
        }
      }

      log.info("Pushing merged index for segment[%s].", identifier);

      removeDirectory(mergedTarget);

      if (mergedTarget.exists()) {
        throw new ISE("Merged target[%s] exists after removing?!", mergedTarget);
      }

      final File mergedFile;
      List<QueryableIndex> indexes = Lists.newArrayList();
      Closer closer = Closer.create();
      try {
        for (FireHydrant fireHydrant : sink) {
          Pair<Segment, Closeable> segmentAndCloseable = fireHydrant.getAndIncrementSegment();
          final QueryableIndex queryableIndex = segmentAndCloseable.lhs.asQueryableIndex();
          log.info("Adding hydrant[%s]", fireHydrant);
          indexes.add(queryableIndex);
          closer.register(segmentAndCloseable.rhs);
        }

        mergedFile = indexMerger.mergeQueryableIndex(
            indexes,
            schema.getGranularitySpec().isRollup(),
            schema.getAggregators(),
            mergedTarget,
            tuningConfig.getIndexSpec(),
            tuningConfig.getSegmentWriteOutMediumFactory()
        );
      }
      catch (Throwable t) {
        throw closer.rethrow(t);
      }
      finally {
        closer.close();
      }

      // Retry pushing segments because uploading to deep storage might fail especially for cloud storage types
      final DataSegment segment = RetryUtils.retry(
          // The appenderator is currently being used for the local indexing task and the Kafka indexing task. For the
          // Kafka indexing task, pushers must use unique file paths in deep storage in order to maintain exactly-once
          // semantics.
          () -> dataSegmentPusher.push(
              mergedFile,
              sink.getSegment().withDimensions(IndexMerger.getMergedDimensionsFromQueryableIndexes(indexes)),
              useUniquePath
          ),
          exception -> exception instanceof Exception,
          5
      );

      objectMapper.writeValue(descriptorFile, segment);

      log.info("Pushed merged index for segment[%s], descriptor is: %s", identifier, segment);

      return segment;
    }
    catch (Exception e) {
      metrics.incrementFailedHandoffs();
      log.warn(e, "Failed to push merged index for segment[%s].", identifier);
      throw Throwables.propagate(e);
    }
  }

  @Override
  public void close()
  {
    if (!closed.compareAndSet(false, true)) {
      log.info("Appenderator already closed");
      return;
    }

    log.info("Shutting down...");

    final List<ListenableFuture<?>> futures = Lists.newArrayList();
    for (Map.Entry<SegmentIdentifier, Sink> entry : sinks.entrySet()) {
      futures.add(abandonSegment(entry.getKey(), entry.getValue(), false));
    }

    try {
      Futures.allAsList(futures).get();
    }
    catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      log.warn(e, "Interrupted during close()");
    }
    catch (ExecutionException e) {
      log.warn(e, "Unable to abandon existing segments during close()");
    }

    try {
      shutdownExecutors();
      Preconditions.checkState(
          persistExecutor == null || persistExecutor.awaitTermination(365, TimeUnit.DAYS),
          "persistExecutor not terminated"
      );
      Preconditions.checkState(
          pushExecutor == null || pushExecutor.awaitTermination(365, TimeUnit.DAYS),
          "pushExecutor not terminated"
      );
      Preconditions.checkState(
          intermediateTempExecutor == null || intermediateTempExecutor.awaitTermination(365, TimeUnit.DAYS),
          "intermediateTempExecutor not terminated"
      );
      persistExecutor = null;
      pushExecutor = null;
      intermediateTempExecutor = null;
    }
    catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new ISE("Failed to shutdown executors during close()");
    }

    // Only unlock if executors actually shut down.
    unlockBasePersistDirectory();
  }

  /**
   * Unannounce the segments and wait for outstanding persists to finish.
   * Do not unlock the base persist dir, as we are not waiting for the push executor to shut down; we rely on the
   * current JVM shutting down, so that no locking problem arises if the task is restored.
   * If a task is restored while the current task is still active because of the push executor (which it shouldn't
   * be, since the push executor starts daemon threads), then the locking should fail and the new task should fail
   * to start. This also means that this method should only be called when the task is shutting down.
   */
  @Override
  public void closeNow()
  {
    if (!closed.compareAndSet(false, true)) {
      log.info("Appenderator already closed");
      return;
    }

    log.info("Shutting down immediately...");
    for (Map.Entry<SegmentIdentifier, Sink> entry : sinks.entrySet()) {
      try {
        segmentAnnouncer.unannounceSegment(entry.getValue().getSegment());
      }
      catch (Exception e) {
        log.makeAlert(e, "Failed to unannounce segment[%s]", schema.getDataSource())
           .addData("identifier", entry.getKey().getIdentifierAsString())
           .emit();
      }
    }

    try {
      shutdownExecutors();
      // We don't wait for pushExecutor to be terminated. See Javadoc for more details.
      Preconditions.checkState(
          persistExecutor == null || persistExecutor.awaitTermination(365, TimeUnit.DAYS),
          "persistExecutor not terminated"
      );
      Preconditions.checkState(
          intermediateTempExecutor == null || intermediateTempExecutor.awaitTermination(365, TimeUnit.DAYS),
          "intermediateTempExecutor not terminated"
      );
      persistExecutor = null;
      intermediateTempExecutor = null;
    }
    catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new ISE("Failed to shutdown executors during close()");
    }
  }

  private void lockBasePersistDirectory()
  {
    if (basePersistDirLock == null) {
      try {
        basePersistDirLockChannel = FileChannel.open(
            computeLockFile().toPath(),
            StandardOpenOption.CREATE,
            StandardOpenOption.WRITE
        );

        basePersistDirLock = basePersistDirLockChannel.tryLock();
        if (basePersistDirLock == null) {
          throw new ISE("Cannot acquire lock on basePersistDir: %s", computeLockFile());
        }
      }
      catch (IOException e) {
        throw Throwables.propagate(e);
      }
    }
  }

  private void unlockBasePersistDirectory()
  {
    try {
      if (basePersistDirLock != null) {
        basePersistDirLock.release();
        basePersistDirLockChannel.close();
        basePersistDirLock = null;
      }
    }
    catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }

  private void initializeExecutors()
  {
    final int maxPendingPersists = tuningConfig.getMaxPendingPersists();

    if (persistExecutor == null) {
      // use a blocking single threaded executor to throttle the firehose when write to disk is slow
      persistExecutor = MoreExecutors.listeningDecorator(
          Execs.newBlockingSingleThreaded("appenderator_persist_%d", maxPendingPersists)
      );
    }

    if (pushExecutor == null) {
      // use a blocking single threaded executor to throttle the firehose when write to disk is slow
      pushExecutor = MoreExecutors.listeningDecorator(
          Execs.newBlockingSingleThreaded("appenderator_merge_%d", 1)
      );
    }

    if (intermediateTempExecutor == null) {
      // use single threaded executor with SynchronousQueue so that all abandon operations occur sequentially
      intermediateTempExecutor = MoreExecutors.listeningDecorator(
          Execs.newBlockingSingleThreaded("appenderator_abandon_%d", 0)
      );
    }
  }

  private void shutdownExecutors()
  {
    if (persistExecutor != null) {
      persistExecutor.shutdownNow();
    }

    if (pushExecutor != null) {
      pushExecutor.shutdownNow();
    }

    if (intermediateTempExecutor != null) {
      intermediateTempExecutor.shutdownNow();
    }
  }

  private void resetNextFlush()
  {
    nextFlush = DateTimes.nowUtc().plus(tuningConfig.getIntermediatePersistPeriod()).getMillis();
  }

  /**
   * Populate "sinks" and "sinkTimeline" with committed segments, and announce them with the segmentAnnouncer.
   *
   * @return persisted commit metadata
   */
  private Object bootstrapSinksFromDisk()
  {
    Preconditions.checkState(sinks.isEmpty(), "Already bootstrapped?!");

    final File baseDir = tuningConfig.getBasePersistDirectory();
    if (!baseDir.exists()) {
      return null;
    }

    final File[] files = baseDir.listFiles();
    if (files == null) {
      return null;
    }

    final Committed committed;
    File commitFile = null;
    try {
      commitLock.lock();
      commitFile = computeCommitFile();
      if (commitFile.exists()) {
        committed = objectMapper.readValue(commitFile, Committed.class);
      } else {
        committed = Committed.nil();
      }
    }
    catch (Exception e) {
      throw new ISE(e, "Failed to read commitFile: %s", commitFile);
    }
    finally {
      commitLock.unlock();
    }

    int rowsSoFar = 0;

    log.info("Loading sinks from[%s]: %s", baseDir, committed.getHydrants().keySet());

    for (File sinkDir : files) {
      final File identifierFile = new File(sinkDir, IDENTIFIER_FILE_NAME);
      if (!identifierFile.isFile()) {
        // No identifier in this sinkDir; it must not actually be a sink directory. Skip it.
        continue;
      }
      try {
        final SegmentIdentifier identifier = objectMapper.readValue(
            new File(sinkDir, "identifier.json"),
            SegmentIdentifier.class
        );

        final int committedHydrants = committed.getCommittedHydrants(identifier.getIdentifierAsString());

        if (committedHydrants <= 0) {
          log.info("Removing uncommitted sink at [%s]", sinkDir);
          FileUtils.deleteDirectory(sinkDir);
          continue;
        }

        // To avoid reading and listing of "merged" dir and other special files
        final File[] sinkFiles = sinkDir.listFiles((dir, fileName) -> !(Ints.tryParse(fileName) == null));

        Arrays.sort(
            sinkFiles,
            (o1, o2) -> Ints.compare(Integer.parseInt(o1.getName()), Integer.parseInt(o2.getName()))
        );

        List<FireHydrant> hydrants = Lists.newArrayList();
        for (File hydrantDir : sinkFiles) {
          final int hydrantNumber = Integer.parseInt(hydrantDir.getName());

          if (hydrantNumber >= committedHydrants) {
            log.info("Removing uncommitted segment at [%s]", hydrantDir);
            FileUtils.deleteDirectory(hydrantDir);
          } else {
            log.info("Loading previously persisted segment at [%s]", hydrantDir);
            if (hydrantNumber != hydrants.size()) {
              throw new ISE("Missing hydrant [%,d] in sinkDir [%s].", hydrants.size(), sinkDir);
            }

            hydrants.add(
                new FireHydrant(
                    new QueryableIndexSegment(identifier.getIdentifierAsString(), indexIO.loadIndex(hydrantDir)),
                    hydrantNumber
                )
            );
          }
        }

        // Make sure we loaded enough hydrants.
        if (committedHydrants != hydrants.size()) {
          throw new ISE("Missing hydrant [%,d] in sinkDir [%s].", hydrants.size(), sinkDir);
        }

        Sink currSink = new Sink(
            identifier.getInterval(),
            schema,
            identifier.getShardSpec(),
            identifier.getVersion(),
            tuningConfig.getMaxRowsInMemory(),
            maxBytesTuningConfig,
            tuningConfig.isReportParseExceptions(),
            null,
            hydrants
        );
        rowsSoFar += currSink.getNumRows();
        sinks.put(identifier, currSink);
        sinkTimeline.add(
            currSink.getInterval(),
            currSink.getVersion(),
            identifier.getShardSpec().createChunk(currSink)
        );

        segmentAnnouncer.announceSegment(currSink.getSegment());
      }
      catch (IOException e) {
        log.makeAlert(e, "Problem loading sink[%s] from disk.", schema.getDataSource())
           .addData("sinkDir", sinkDir)
           .emit();
      }
    }

    // Make sure we loaded all committed sinks.
    final Set<String> loadedSinks = Sets.newHashSet(
        Iterables.transform(sinks.keySet(), input -> input.getIdentifierAsString())
    );
    final Set<String> missingSinks = Sets.difference(committed.getHydrants().keySet(), loadedSinks);
    if (!missingSinks.isEmpty()) {
      throw new ISE("Missing committed sinks [%s]", Joiner.on(", ").join(missingSinks));
    }

    totalRows.set(rowsSoFar);
    return committed.getMetadata();
  }

  private ListenableFuture<?> abandonSegment(
      final SegmentIdentifier identifier,
      final Sink sink,
      final boolean removeOnDiskData
  )
  {
    // Ensure no future writes will be made to this sink.
    if (sink.finishWriting()) {
      // Decrement this sink's rows from the counters. we only count active sinks so that we don't double decrement,
      // i.e. those that haven't been persisted for *InMemory counters, or pushed to deep storage for the total counter.
      rowsCurrentlyInMemory.addAndGet(-sink.getNumRowsInMemory());
      bytesCurrentlyInMemory.addAndGet(-sink.getBytesInMemory());
      totalRows.addAndGet(-sink.getNumRows());
    }

    // Mark this identifier as dropping, so no future push tasks will pick it up.
    droppingSinks.add(identifier);

    // Wait for any outstanding pushes to finish, then abandon the segment inside the persist thread.
    return Futures.transform(
        pushBarrier(),
        new Function<Object, Object>()
        {
          @Nullable
          @Override
          public Object apply(@Nullable Object input)
          {
            if (sinks.get(identifier) != sink) {
              // Only abandon sink if it is the same one originally requested to be abandoned.
              log.warn("Sink for segment[%s] no longer valid, not abandoning.", identifier);
              return null;
            }

            if (removeOnDiskData) {
              // Remove this segment from the committed list. This must be done from the persist thread.
              log.info("Removing commit metadata for segment[%s].", identifier);
              try {
                commitLock.lock();
                final Committed oldCommit = readCommit();
                if (oldCommit != null) {
                  writeCommit(oldCommit.without(identifier.getIdentifierAsString()));
                }
              }
              catch (Exception e) {
                log.makeAlert(e, "Failed to update committed segments[%s]", schema.getDataSource())
                   .addData("identifier", identifier.getIdentifierAsString())
                   .emit();
                throw Throwables.propagate(e);
              }
              finally {
                commitLock.unlock();
              }
            }

            // Unannounce the segment.
            try {
              segmentAnnouncer.unannounceSegment(sink.getSegment());
            }
            catch (Exception e) {
              log.makeAlert(e, "Failed to unannounce segment[%s]", schema.getDataSource())
                 .addData("identifier", identifier.getIdentifierAsString())
                 .emit();
            }

            log.info("Removing sink for segment[%s].", identifier);
            sinks.remove(identifier);
            metrics.setSinkCount(sinks.size());
            droppingSinks.remove(identifier);
            sinkTimeline.remove(
                sink.getInterval(),
                sink.getVersion(),
                identifier.getShardSpec().createChunk(sink)
            );
            for (FireHydrant hydrant : sink) {
              if (cache != null) {
                cache.close(SinkQuerySegmentWalker.makeHydrantCacheIdentifier(hydrant));
              }
              hydrant.swapSegment(null);
            }

            if (removeOnDiskData) {
              removeDirectory(computePersistDir(identifier));
            }

            return null;
          }
        },
        // use persistExecutor to make sure that all the pending persists complete before
        // starting to abandon segments
        persistExecutor
    );
  }

  private Committed readCommit() throws IOException
  {
    final File commitFile = computeCommitFile();
    if (commitFile.exists()) {
      // merge current hydrants with existing hydrants
      return objectMapper.readValue(commitFile, Committed.class);
    } else {
      return null;
    }
  }

  private void writeCommit(Committed newCommit) throws IOException
  {
    final File commitFile = computeCommitFile();
    objectMapper.writeValue(commitFile, newCommit);
  }

  private File computeCommitFile()
  {
    return new File(tuningConfig.getBasePersistDirectory(), "commit.json");
  }

  private File computeLockFile()
  {
    return new File(tuningConfig.getBasePersistDirectory(), ".lock");
  }

  private File computePersistDir(SegmentIdentifier identifier)
  {
    return new File(tuningConfig.getBasePersistDirectory(), identifier.getIdentifierAsString());
  }

  private File computeIdentifierFile(SegmentIdentifier identifier)
  {
    return new File(computePersistDir(identifier), IDENTIFIER_FILE_NAME);
  }

  private File computeDescriptorFile(SegmentIdentifier identifier)
  {
    return new File(computePersistDir(identifier), "descriptor.json");
  }

  private File createPersistDirIfNeeded(SegmentIdentifier identifier) throws IOException
  {
    final File persistDir = computePersistDir(identifier);
    FileUtils.forceMkdir(persistDir);

    objectMapper.writeValue(computeIdentifierFile(identifier), identifier);

    return persistDir;
  }

  /**
   * Persists the given hydrant and returns the number of rows persisted. Must only be called in the single-threaded
   * persistExecutor.
   *
   * @param indexToPersist hydrant to persist
   * @param identifier     the segment this hydrant is going to be part of
   *
   * @return the number of rows persisted
   */
  private int persistHydrant(FireHydrant indexToPersist, SegmentIdentifier identifier)
  {
    synchronized (indexToPersist) {
      if (indexToPersist.hasSwapped()) {
        log.info(
            "Segment[%s], Hydrant[%s] already swapped. Ignoring request to persist.",
            identifier,
            indexToPersist
        );
        return 0;
      }

      log.info("Segment[%s], persisting Hydrant[%s]", identifier, indexToPersist);

      try {
        int numRows = indexToPersist.getIndex().size();

        final File persistedFile;
        final File persistDir = createPersistDirIfNeeded(identifier);
        final IndexSpec indexSpec = tuningConfig.getIndexSpec();
        persistedFile = indexMerger.persist(
            indexToPersist.getIndex(),
            identifier.getInterval(),
            new File(persistDir, String.valueOf(indexToPersist.getCount())),
            indexSpec,
            tuningConfig.getSegmentWriteOutMediumFactory()
        );

        indexToPersist.swapSegment(
            new QueryableIndexSegment(indexToPersist.getSegmentIdentifier(), indexIO.loadIndex(persistedFile))
        );

        return numRows;
      }
      catch (IOException e) {
        log.makeAlert("dataSource[%s] -- incremental persist failed", schema.getDataSource())
           .addData("segment", identifier.getIdentifierAsString())
           .addData("count", indexToPersist.getCount())
           .emit();

        throw Throwables.propagate(e);
      }
    }
  }

  private void removeDirectory(final File target)
  {
    if (target.exists()) {
      try {
        log.info("Deleting Index File[%s]", target);
        FileUtils.deleteDirectory(target);
      }
      catch (Exception e) {
        log.makeAlert(e, "Failed to remove directory[%s]", schema.getDataSource())
           .addData("file", target)
           .emit();
      }
    }
  }
}
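
/*
 * Illustrative usage sketch, not part of the original class. It walks through the Appenderator
 * lifecycle implemented above -- startJob(), add(), push(), and close() -- using only methods
 * whose signatures appear in this file. The appenderator, segment identifier, rows, and committer
 * supplier are assumed to be wired up elsewhere (e.g. by an indexing task); the helper class name
 * AppenderatorUsageSketch and its method are hypothetical.
 */
class AppenderatorUsageSketch
{
  static SegmentsAndMetadata ingestAndPush(
      final Appenderator appenderator,
      final SegmentIdentifier identifier,
      final Iterable<InputRow> rows,
      final Supplier<Committer> committerSupplier
  ) throws Exception
  {
    // Locks basePersistDirectory, restores any sinks persisted by a previous run
    // (bootstrapSinksFromDisk), and returns the last committed metadata, or null on a fresh start.
    final Object restoredCommitMetadata = appenderator.startJob();

    for (InputRow row : rows) {
      // allowIncrementalPersists = true lets add() kick off a background persist when
      // maxRowsInMemory, maxBytesInMemory, or the intermediate persist period is exceeded.
      appenderator.add(identifier, row, committerSupplier, true);
    }

    // push() persists all in-memory data and commits metadata first (it calls persistAll()
    // internally), then merges each sink's hydrants and uploads the result to deep storage on the
    // pushExecutor. useUniquePath = false here; Kafka indexing tasks pass true to preserve
    // exactly-once semantics, as noted in mergeAndPush().
    final SegmentsAndMetadata pushed =
        appenderator.push(appenderator.getSegments(), committerSupplier.get(), false).get();

    // Unannounces segments, waits for outstanding persists, shuts down the executors, and
    // releases the basePersistDir lock.
    appenderator.close();

    return pushed;
  }
}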