Java tutorial
/** * Copyright 2015 freiheit.com technologies gmbh * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.freiheit.fuava.simplebatch; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.stream.StreamSupport; import javax.annotation.CheckReturnValue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.freiheit.fuava.simplebatch.fetch.FetchedItem; import com.freiheit.fuava.simplebatch.fetch.Fetcher; import com.freiheit.fuava.simplebatch.processor.Processor; import com.freiheit.fuava.simplebatch.processor.TimeLoggingProcessor; import com.freiheit.fuava.simplebatch.result.DelegatingProcessingResultListener; import com.freiheit.fuava.simplebatch.result.ProcessingResultListener; import com.freiheit.fuava.simplebatch.result.Result; import com.freiheit.fuava.simplebatch.result.ResultStatistics; import com.google.common.base.Preconditions; import com.google.common.collect.FluentIterable; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; /** * Downloads - or more generally, processes - data in two stages, via iterables. * * The output of the first stage is used as input for the second stage. * * A typical usecase would be to fetch Ids or URLs of the data to download in * the first stage ('source'), and then to fetch the data in the second stage * ('process'), persisting the data and the progress information in the third * stage 'sink'. * * The downloader uses iterables and does the stage 2 downloading in batches, so * you can provide iterables that stream over some data source, efficiently * processing huge amounts of data. * * @author Klas Kalass <klas.kalass@freiheit.com> * * @param <OriginalInput> * The data fetched by the fetcher * @param <Output> * The result of the processor */ public class BatchJob<OriginalInput, Output> { private static final Logger LOG = LoggerFactory.getLogger(BatchJob.class); public static final int TERMINATION_TIMEOUT_HOURS = 96; public static final int PANIC_VM_ERROR = 1; private static final class BatchJobThreadGroup extends ThreadGroup { private final PanicCallback panicCallback; private BatchJobThreadGroup(final String name, final PanicCallback callback) { super(name); this.panicCallback = callback; } @Override public void uncaughtException(final Thread t, final Throwable e) { try { LOG.error(e.getMessage(), e); } catch (final Throwable t2) { panicCallback.panic("Thread died while logging", PANIC_VM_ERROR); return; } if (e instanceof VirtualMachineError) { panicCallback.panic("Thread died with VirtualMachineError", PANIC_VM_ERROR); } } } private final class CallProcessor implements Consumer<List<Result<FetchedItem<OriginalInput>, OriginalInput>>> { private final DelegatingProcessingResultListener<OriginalInput, Output> listeners; private final PanicCallback panicCallback; private CallProcessor(final DelegatingProcessingResultListener<OriginalInput, Output> listeners, final PanicCallback panicCallback) { this.listeners = listeners; this.panicCallback = Preconditions.checkNotNull(panicCallback); } @Override public void accept(final List<Result<FetchedItem<OriginalInput>, OriginalInput>> sourceResults) { try { listeners.onFetchResults(sourceResults); final Iterable<? extends Result<FetchedItem<OriginalInput>, Output>> processingResults = persistence .process(sourceResults); listeners.onProcessingResults(processingResults); } catch (final VirtualMachineError e) { LOG.error( "FATAL: Exception went through the Processors. You need to ensure that this cannot happen, in order to achieve proper error handling " + e.getMessage(), e); /* * There is no way we can get out of this. We need to ensure the entire program halts - thus we do a system */ panicCallback.panic("Virtual Machine Error", PANIC_VM_ERROR); } catch (final Throwable t) { LOG.error( "FATAL: Exception went through the Processors. You need to ensure that this cannot happen, in order to achieve proper error handling" + t.getMessage(), t); } } } public static class Builder<OriginalInput, Output> { private int processingBatchSize = 1000; private boolean parallel = false; private Integer numParallelThreads = null; private int parallelTerminationTimeoutHours = TERMINATION_TIMEOUT_HOURS; private boolean printFinalTimeMeasures = true; private Fetcher<OriginalInput> fetcher; private PanicCallback panicCallback; private Processor<FetchedItem<OriginalInput>, OriginalInput, Output> processor; private final ArrayList<ProcessingResultListener<OriginalInput, Output>> listeners = new ArrayList<ProcessingResultListener<OriginalInput, Output>>(); private String description; public Builder() { } public int getProcessingBatchSize() { return processingBatchSize; } public boolean isParallel() { return parallel; } /** * Set to false to process in current thread. Set true to use multiple Threads for processing (each chunk is thread confined though). * @return this for method chaining */ public Builder<OriginalInput, Output> setParallel(final boolean parallel) { this.parallel = parallel; return this; } /** * Set the callback for 'panic' situations like virtual machine errors where it makes no sense to try and continue processing. * Default behaviour is, that System.exit() is called. * @param panicCallback The callback for panic situations * @return this instance for method chaining */ public Builder<OriginalInput, Output> setPanicCallback(final PanicCallback panicCallback) { this.panicCallback = panicCallback; return this; } /** * Set the number of hours this jobs waits for the spawned chunks to be processed. * Applies only if you use {@link #setNumParallelThreads(Integer)}. * * @return this for method chaining */ public Builder<OriginalInput, Output> setParallelTerminationTimeoutHours( final int parallelTerminationTimeoutHours) { this.parallelTerminationTimeoutHours = parallelTerminationTimeoutHours; return this; } public int getParallelTerminationTimeoutHours() { return parallelTerminationTimeoutHours; } /** * The number of threads to use for parallel processing. If set to null and parallel is set to true, Java 8 parallel streaming will be used. * @return this for method chaining */ public Builder<OriginalInput, Output> setNumParallelThreads(final Integer numParallelThreads) { this.numParallelThreads = numParallelThreads; return this; } public Integer getNumParallelThreads() { return numParallelThreads; } public boolean isPrintFinalTimeMeasures() { return printFinalTimeMeasures; } /** * Whether or not the final performance measures should be printed after run has finished * @return this for method chaining */ public Builder<OriginalInput, Output> setPrintFinalTimeMeasures(final boolean printFinalTimeMeasures) { this.printFinalTimeMeasures = printFinalTimeMeasures; return this; } /** * How many items from the fetcher are put together in one chunk and processed together * @return this for method chaining */ public Builder<OriginalInput, Output> setProcessingBatchSize(final int processingBatchSize) { this.processingBatchSize = processingBatchSize; return this; } /** * The fetcher that produces the items to process. Should be fast * @return this for method chaining */ public Builder<OriginalInput, Output> setFetcher(final Fetcher<OriginalInput> idsFetcher) { this.fetcher = idsFetcher; return this; } public Fetcher<OriginalInput> getFetcher() { return fetcher; } /** * The processor for processing chunks of items which were produced by the fetcher. May be slow. * @return this for method chaining */ public Builder<OriginalInput, Output> setProcessor( final Processor<FetchedItem<OriginalInput>, OriginalInput, Output> writer) { this.processor = writer; return this; } public Processor<FetchedItem<OriginalInput>, OriginalInput, Output> getProcessor() { return processor; } /** * Add a listener to call when processing events happen * @return this for method chaining */ public Builder<OriginalInput, Output> addListener( final ProcessingResultListener<OriginalInput, Output> listener) { this.listeners.add(listener); return this; } /** * Add listeners to call when processing events happen * @return this for method chaining */ public Builder<OriginalInput, Output> addListeners( final Collection<ProcessingResultListener<OriginalInput, Output>> listeners) { this.listeners.addAll(listeners); return this; } public Builder<OriginalInput, Output> removeListener( final ProcessingResultListener<OriginalInput, Output> listener) { this.listeners.remove(listener); return this; } public Builder<OriginalInput, Output> removeListeners( final Collection<ProcessingResultListener<OriginalInput, Output>> listeners) { this.listeners.removeAll(listeners); return this; } public ArrayList<ProcessingResultListener<OriginalInput, Output>> getListeners() { return listeners; } /** * The Description of the job * @return this for method chaining */ public Builder<OriginalInput, Output> setDescription(final String desc) { this.description = desc; return this; } public BatchJob<OriginalInput, Output> build() { final PanicCallback panicCallback = getPanicCallback(); return new BatchJob<OriginalInput, Output>(description, processingBatchSize, parallel, numParallelThreads, parallelTerminationTimeoutHours, fetcher, processor, printFinalTimeMeasures, listeners, panicCallback); } public PanicCallback getPanicCallback() { return this.panicCallback == null ? new DefaultPanicCallback() : this.panicCallback; } public String getDescription() { return description; } } private final int processingBatchSize; private final boolean parallel; private final Integer numParallelThreads; private final Fetcher<OriginalInput> fetcher; private final Processor<FetchedItem<OriginalInput>, OriginalInput, Output> persistence; private final List<ProcessingResultListener<OriginalInput, Output>> listeners; private final String description; private final boolean printFinalTimeMeasures; private final int parallelTerminationTimeoutHours; private final PanicCallback panicCallback; /** * Callback for severe error conditions which should lead to aborting the entire processing. * Called for {@link VirtualMachineError}s such as {@link OutOfMemoryError}. */ public interface PanicCallback { void panic(String reason, int code); } /** * Default implementation of the panic callback which simply calls System.exit. */ public static final class DefaultPanicCallback implements PanicCallback { @Override public void panic(final String reason, final int code) { final String message = "System exit (" + code + ") due to: " + reason; LOG.error(message); System.err.println(message); System.exit(code); } } /** * @param description The Description of the job * @param processingBatchSize How many items from the fetcher are put together in one chunk and processed together * @param parallel false: process in current thread. true: use multiple Threads for processing (each chunk is thread confined though) * @param numParallelThreads the number of threads to use for parallel processing. If null and parallel is set to true, Java 8 parallel streaming will be used. * @param fetcher The fetcher that produces the items to process. Should be fast * @param processor The processor for processing chunks of items which were produced by the fetcher. May be slow. * @param printFinalTimeMeasures Wether or not the final performance measures should be printed after run has finished * @param listeners Listeners to call when processing events happen */ protected BatchJob(final String description, final int processingBatchSize, final boolean parallel, final Integer numParallelThreads, final int parallelTerminationTimeoutHours, final Fetcher<OriginalInput> fetcher, final Processor<FetchedItem<OriginalInput>, OriginalInput, Output> processor, final boolean printFinalTimeMeasures, final List<ProcessingResultListener<OriginalInput, Output>> listeners, final PanicCallback panicCallback) { this.description = description; this.processingBatchSize = processingBatchSize; this.parallel = parallel; this.numParallelThreads = numParallelThreads; this.parallelTerminationTimeoutHours = parallelTerminationTimeoutHours; this.fetcher = fetcher; this.persistence = processor; this.printFinalTimeMeasures = printFinalTimeMeasures; this.listeners = ImmutableList.copyOf(listeners); this.panicCallback = Preconditions.checkNotNull(panicCallback, "Panic Callback must be set"); } public static <Input, Output> Builder<Input, Output> builder() { return new Builder<Input, Output>(); } @CheckReturnValue public ResultStatistics run() { final ResultStatistics.Builder<OriginalInput, Output> resultBuilder = ResultStatistics.builder(); final DelegatingProcessingResultListener<OriginalInput, Output> listeners = new DelegatingProcessingResultListener<OriginalInput, Output>( ImmutableList.<ProcessingResultListener<OriginalInput, Output>>builder().add(resultBuilder) .addAll(this.listeners).build()); listeners.onBeforeRun(this.description); final Iterable<Result<FetchedItem<OriginalInput>, OriginalInput>> sourceIterable = fetcher.fetchAll(); if (sourceIterable instanceof Collection && this.persistence instanceof TimeLoggingProcessor) { // Iterables could be lazy, but if it is a collection it should not be lazy so we can // count and report the result of the prepare stage. final Collection<?> collection = (Collection<?>) sourceIterable; ((TimeLoggingProcessor<?, ?, ?>) this.persistence).addNumPreparedItems(collection.size(), FluentIterable.from(collection).filter(o -> ((Result<?, ?>) o).isSuccess()).size(), FluentIterable.from(collection).filter(o -> ((Result<?, ?>) o).isFailed()).size()); } process(listeners, sourceIterable); listeners.onAfterRun(); resultBuilder.setListenerDelegationFailures(listeners.hasDelegationFailures()); final ResultStatistics statistics = resultBuilder.build(); if (this.printFinalTimeMeasures && this.persistence instanceof TimeLoggingProcessor) { ((TimeLoggingProcessor<?, ?, ?>) this.persistence).logFinalCounts(); } return statistics; } protected void process(final DelegatingProcessingResultListener<OriginalInput, Output> listeners, final Iterable<Result<FetchedItem<OriginalInput>, OriginalInput>> sourceIterable) { if (this.parallel && this.numParallelThreads != null && this.numParallelThreads.intValue() > 0) { processWithBlockingQueue(listeners, this.numParallelThreads, sourceIterable); } else { processWithStreams(listeners, this.parallel, sourceIterable); } } protected void processWithStreams(final DelegatingProcessingResultListener<OriginalInput, Output> listeners, final boolean useParallelStream, final Iterable<Result<FetchedItem<OriginalInput>, OriginalInput>> sourceIterable) { final Iterable<List<Result<FetchedItem<OriginalInput>, OriginalInput>>> partitions = Iterables .partition(sourceIterable, processingBatchSize); StreamSupport.stream(partitions.spliterator(), parallel) .forEach(new CallProcessor(listeners, panicCallback)); } private void processWithBlockingQueue(final DelegatingProcessingResultListener<OriginalInput, Output> listeners, final int numParallelThreads, final Iterable<Result<FetchedItem<OriginalInput>, OriginalInput>> sourceIterable) { final ThreadGroup threadGroup = new BatchJobThreadGroup("Simplebatch Processing", this.panicCallback); try { new BlockingQueueExecutor<OriginalInput>(numParallelThreads, processingBatchSize, TimeUnit.HOURS.toMillis(this.parallelTerminationTimeoutHours), new CallProcessor(listeners, panicCallback), threadGroup).accept(sourceIterable); } finally { threadGroup.destroy(); } } }