Java tutorial: HfdsDataWriteService - a generic Aleph2 service for writing data out to HDFS
/*******************************************************************************
 * Copyright 2015, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.aleph2.storage_service_hdfs.services;

import java.io.IOException;
import java.io.OutputStream;
import java.time.Duration;
import java.util.Collections;
import java.util.Date;
import java.util.EnumSet;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.http.impl.cookie.DateUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import scala.Tuple2;

import com.fasterxml.jackson.databind.JsonNode;
import com.ikanow.aleph2.data_model.interfaces.data_services.IStorageService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.ICrudService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IDataServiceProvider.IGenericDataService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.StorageSchemaBean;
import com.ikanow.aleph2.data_model.utils.BeanTemplateUtils;
import com.ikanow.aleph2.data_model.utils.ErrorUtils;
import com.ikanow.aleph2.data_model.utils.FutureUtils;
import com.ikanow.aleph2.data_model.utils.Lambdas;
import com.ikanow.aleph2.data_model.utils.Optionals;
import com.ikanow.aleph2.data_model.utils.Patterns;
import com.ikanow.aleph2.data_model.utils.SetOnce;
import com.ikanow.aleph2.data_model.utils.TimeUtils;
import com.ikanow.aleph2.data_model.utils.Tuples;
import com.ikanow.aleph2.data_model.utils.UuidUtils;
import com.ikanow.aleph2.storage_service_hdfs.utils.HdfsErrorUtils;

import fj.Unit;

/** Generic service for writing data out to HDFS
 * @author Alex
 */
public class HfdsDataWriteService<T> implements IDataWriteService<T> {
    protected static final Logger _logger = LogManager.getLogger();

    //TODO (ALEPH-12): doesn't seem to be working with processed (netflow/sample)

    /////////////////////////////////////////////////////////////

    // TOP LEVEL SERVICE

    protected final DataBucketBean _bucket;
    protected final IStorageService.StorageStage _stage;
    protected final FileContext _dfs;
    protected final IStorageService _storage_service;
    protected final String _buffer_name;
    protected final Optional<String> _job_name;
    protected final IGenericDataService _parent;

    // (currently just share one of these across all users of this service, basically across the process/classloader)
    protected final SetOnce<BatchHdfsWriteService> _writer = new SetOnce<>();

    protected final static String _process_id = UuidUtils.get().getRandomUuid().substring(14);

    /** User constructor
     * @param bucket
     */
    public HfdsDataWriteService(final DataBucketBean bucket, final IGenericDataService parent,
            final IStorageService.StorageStage stage, final Optional<String> job_name,
            final IStorageService storage_service, final Optional<String> secondary_buffer)
    {
        _bucket = bucket;
        _stage = stage;
        _buffer_name = secondary_buffer.orElse(IStorageService.PRIMARY_BUFFER_SUFFIX);
        _storage_service = storage_service;
        _job_name = job_name;
        _dfs = storage_service.getUnderlyingPlatformDriver(FileContext.class, Optional.empty()).get();
        _parent = parent;
    }

    /** Lazy initialization for batch writer */
    public void setup() {
        if (!_writer.isSet()) {
            _writer.trySet(new BatchHdfsWriteService());
        }
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#storeObject(java.lang.Object)
     */
    @Override
    public CompletableFuture<Supplier<Object>> storeObject(T new_object) {
        setup();
        _writer.get().storeObject(new_object);
        return CompletableFuture.completedFuture(() -> {
            return null;
        });
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#storeObjects(java.util.List)
     */
    @Override
    public CompletableFuture<Tuple2<Supplier<List<Object>>, Supplier<Long>>> storeObjects(List<T> new_objects) {
        setup();
        _writer.get().storeObjects(new_objects);
        return CompletableFuture
                .completedFuture(Tuples._2T(() -> Collections.emptyList(), () -> (long) new_objects.size()));
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#countObjects()
     */
    @Override
    public CompletableFuture<Long> countObjects() {
        // (return a future exception)
        return FutureUtils.returnError(
                new RuntimeException(ErrorUtils.get(HdfsErrorUtils.OPERATION_NOT_SUPPORTED, "countObjects")));
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#deleteDatastore()
     */
    @Override
    public CompletableFuture<Boolean> deleteDatastore() {
        return _parent
                .handleBucketDeletionRequest(_bucket, Optional.of(_buffer_name)
                        .filter(name -> !name.equals(IStorageService.PRIMARY_BUFFER_SUFFIX)), false)
                .thenApply(res -> res.success());
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#getCrudService()
     */
    @Override
    public Optional<ICrudService<T>> getCrudService() {
        throw new RuntimeException(ErrorUtils.get(HdfsErrorUtils.OPERATION_NOT_SUPPORTED, "getCrudService"));
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#getRawService()
     */
    @Override
    public IDataWriteService<JsonNode> getRawService() {
        return new HfdsDataWriteService<JsonNode>(_bucket, _parent, _stage, _job_name, _storage_service,
                Optional.of(_buffer_name));
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#getUnderlyingPlatformDriver(java.lang.Class, java.util.Optional)
     */
    @Override
    public <X> Optional<X> getUnderlyingPlatformDriver(Class<X> driver_class, Optional<String> driver_options) {
        return Optional.empty();
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService#getBatchWriteSubservice()
     */
    @Override
    public Optional<IBatchSubservice<T>> getBatchWriteSubservice() {
        setup();
        return Optional.of(_writer.get());
    }
    /////////////////////////////////////////////////////////////

    /** BATCH SUB SERVICE
     * @author alex
     */
    public class BatchHdfsWriteService implements IBatchSubservice<T> {
        final protected LinkedBlockingQueue<Object> _shared_queue = new LinkedBlockingQueue<>();

        public class MutableState {
            int max_objects = 5000; // (5K objects)
            long size_kb = 20L * 1024L; // (20MB)
            Duration flush_interval = Duration.ofMinutes(10L); // (10 minutes)
            int write_threads = 2;
            ThreadPoolExecutor _workers = null;
        }
        final protected MutableState _state = new MutableState();

        final protected SetOnce<Unit> _initialized = new SetOnce<>(); // (lazy initialization)

        /** Lazy initialization of the writers */
        protected void setup() {
            if (!_initialized.isSet()) {
                _initialized.trySet(Unit.unit());

                // Launch the executor service
                fillUpEmptyQueue();

                // This is ugly but safest - now apply the batch settings
                Optionals.of(() -> _bucket.data_schema().storage_schema())
                        .map(store -> getStorageSubSchema(store, _stage))
                        .map(subschema -> subschema.target_write_settings()).ifPresent(writer -> {
                            setBatchProperties(Optional.<Integer>ofNullable(writer.batch_max_objects()),
                                    Optional.<Long>ofNullable(writer.batch_max_size_kb()),
                                    Optional.ofNullable(writer.batch_flush_interval())
                                            .map(secs -> Duration.ofSeconds(secs)),
                                    Optional.<Integer>ofNullable(writer.target_write_concurrency()));
                        });
            }
        }

        /** User constructor */
        public BatchHdfsWriteService() {
            //(do nothing, lazy initialization)
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService.IBatchSubservice#setBatchProperties(java.util.Optional, java.util.Optional, java.util.Optional, java.util.Optional)
         */
        @Override
        public void setBatchProperties(Optional<Integer> max_objects, Optional<Long> size_kb,
                Optional<Duration> flush_interval, Optional<Integer> write_threads)
        {
            synchronized (this) {
                _state.max_objects = max_objects.orElse(_state.max_objects);
                _state.size_kb = size_kb.orElse(_state.size_kb);
                _state.flush_interval = flush_interval.orElse(_state.flush_interval);

                // (write-thread changes are handled below)
                int old_write_threads = _state.write_threads;
                _state.write_threads = write_threads.orElse(_state.write_threads);

                if ((old_write_threads != _state.write_threads) && _initialized.isSet()) {
                    if (old_write_threads < _state.write_threads) {
                        // easy case, just expand
                        _state._workers.setCorePoolSize(_state.write_threads);
                        _state._workers.setMaximumPoolSize(_state.write_threads);
                        for (int i = old_write_threads; i < _state.write_threads; ++i) {
                            _state._workers.execute(new WriterWorker());
                        }
                    }
                    else if (old_write_threads > _state.write_threads) {
                        // this is a bit ugly - nuke the existing worker queue
                        _state._workers.shutdownNow();
                        try {
                            boolean completed = _state._workers.awaitTermination(5, TimeUnit.SECONDS);
                            if (!completed) {
                                _logger.warn("(workers not completed before timeout expired: "
                                        + _state._workers.toString() + ")");
                            }
                        }
                        catch (Exception e) {
                        }
                        fillUpEmptyQueue();
                    }
                }
            }
            setup();
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService.IBatchSubservice#storeObjects(java.util.List)
         */
        @Override
        public void storeObjects(List<T> new_objects) {
            setup();
            _shared_queue.add(new_objects);
        }

        /* (non-Javadoc)
         * @see com.ikanow.aleph2.data_model.interfaces.shared_services.IDataWriteService.IBatchSubservice#storeObject(java.lang.Object)
         */
        @Override
        public void storeObject(T new_object) {
            setup();
            _shared_queue.add(new_object);
        }

        ////////////////////////////////////////

        // UTILITY

        /** Fills up the empty worker pool with writer threads */
        private void fillUpEmptyQueue()
        {
            _state._workers = new ThreadPoolExecutor(_state.write_threads, _state.write_threads, 60L, TimeUnit.SECONDS,
                    new LinkedBlockingQueue<>());
            for (int i = 0; i < _state.write_threads; ++i) {
                _state._workers.execute(new WriterWorker());
            }
        }
    }

    /////////////////////////////////////////////////////////////

    // BATCH SUB SERVICE - WORKER THREAD

    /** A worker thread
     * @author alex
     */
    public class WriterWorker implements Runnable {
        protected static final String SPOOL_DIR = "/.spooldir/";

        public class MutableState {
            boolean terminate = false;
            Optional<String> codec = Optional.empty();
            int segment = 1;
            int curr_objects;
            long curr_size_b;
            long last_segmented;
            Path curr_path;
            OutputStream out;
        }
        final protected MutableState _state = new MutableState();

        final protected String _thread_id = UuidUtils.get().getRandomUuid().substring(14);

        /* (non-Javadoc)
         * @see java.lang.Runnable#run()
         */
        @Override
        public void run() {
            _logger.info("Starting HDFS worker thread: " + getFilename());

            Runtime.getRuntime().addShutdownHook(new Thread(Lambdas.wrap_runnable_i(() -> {
                _state.terminate = true;
                complete_segment();
            })));

            // (Some internal mutable state - these values are _always_ overwritten)
            boolean more_objects = false;
            int max_objects = 5000; // (5K objects)
            long size_b = 20L * 1024L * 1024L; // (20MB)
            Duration flush_interval = Duration.ofMinutes(10L); // (10 minutes)
            long timeout_ns = flush_interval.toNanos();
            long timeout_ms = timeout_ns / 1_000_000L; // (convert ns to ms)

            try {
                for (; !_state.terminate && !Thread.interrupted();) {
                    if (!more_objects) {
                        synchronized (_writer) {
                            max_objects = _writer.get()._state.max_objects;
                            size_b = _writer.get()._state.size_kb * 1024L;
                            flush_interval = _writer.get()._state.flush_interval;
                            timeout_ns = flush_interval.toNanos();
                            timeout_ms = timeout_ns / 1_000_000L; // (convert ns to ms)
                        }
                    }
                    Object o = _writer.get()._shared_queue.poll(timeout_ns, TimeUnit.NANOSECONDS);

                    if (null == o) {
                        complete_segment();
                        more_objects = false;
                        continue;
                    }
                    if (null == _state.out) {
                        new_segment();
                    }
                    write(o);

                    if (check_segment(max_objects, size_b, timeout_ms)) {
                        complete_segment();
                        more_objects = false;
                    }
                    else {
                        more_objects = null != _writer.get()._shared_queue.peek();
                    }
                }
            }
            catch (Exception e) {
                // assume this is an interrupted error and fall through to....
            }
            try { // always try to complete the current segment before exiting
                complete_segment();
            }
            catch (Exception ee) {
            }
            _logger.info("Terminating HDFS worker thread: " + getFilename());
        }

        /** Write the object(s) out to the stream
         * @param o
         * @throws IOException
         */
        protected void write(final Object o) throws IOException {
            String s = null;
            if (o instanceof List) {
                @SuppressWarnings({ "rawtypes", "unchecked" })
                List<Object> l = (List) o;
                l.stream().forEach(Lambdas.wrap_consumer_u(ol -> write(ol)));
                return;
            }
            else if (o instanceof String) {
                s = ((String) o);
                if (!s.endsWith("\n")) s += "\n"; // (I think it usually will)
            }
            else if (o instanceof JsonNode) {
                s = ((JsonNode) o).toString() + "\n";
            }
            else {
                s = BeanTemplateUtils.toJson(o).toString() + "\n";
            }
            _state.out.write(s.getBytes());

            _state.curr_objects++;
            _state.curr_size_b += s.getBytes().length;
        }

        /** Utility to check the current file vs the time and size limits
         * @param max_objects
         * @param max_size_b
         * @param max_duration_ms
         * @return
         */
        protected boolean check_segment(final int max_objects, final long max_size_b, final long max_duration_ms) {
            final long now = System.currentTimeMillis();
            boolean trigger = ((_state.curr_objects > max_objects) || (_state.curr_size_b > max_size_b)
                    || ((now - _state.last_segmented) > max_duration_ms)
                    || (now < _state.last_segmented) // (clock has changed so trigger immediately)
            );

            //DEBUG
            //if (trigger) System.out.println("TRIGGER NOW: obj=" + (_state.curr_objects > max_objects) + " vs size=" + (_state.curr_size_b > max_size_b) + " vs time=" + ((now - _state.last_segmented) > max_duration_ms));

            return trigger;
        }

        /** Create a new segment
         * @throws Exception
         */
        protected void new_segment() throws Exception {
            if (null == _state.out) { // (otherwise we already have a segment)
                _state.last_segmented = System.currentTimeMillis();
                _state.curr_size_b = 0L;
                _state.curr_objects = 0;
                _state.codec = getCanonicalCodec(_bucket.data_schema().storage_schema(), _stage); // (recheck the codec)

                _state.curr_path = new Path(
                        getBasePath(_storage_service.getBucketRootPath(), _bucket, _stage, _job_name, _buffer_name)
                                + "/" + SPOOL_DIR + "/" + getFilename());

                try {
                    _dfs.mkdir(_state.curr_path.getParent(), FsPermission.getDefault(), true);
                }
                catch (Exception e) {
                }

                _state.out = wrapOutputInCodec(_state.codec,
                        _dfs.create(_state.curr_path, EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE)));
            }
        }

        /** Completes an existing segment
         * @throws IOException
         */
        protected synchronized void complete_segment() throws IOException {
            if ((null != _state.out) && (_state.curr_objects > 0)) {
                _state.out.close();
                _state.out = null;
                _state.segment++;

                final Date now = new Date();
                final Path path = new Path(
                        getBasePath(_storage_service.getBucketRootPath(), _bucket, _stage, _job_name, _buffer_name)
                                + "/" + getSuffix(now, _bucket, _stage) + "/" + _state.curr_path.getName());

                try {
                    _dfs.mkdir(path.getParent(), FsPermission.getDefault(), true);
                }
                catch (Exception e) {
                } // (fails if already exists?)

                _dfs.rename(_state.curr_path, path);
                try {
                    _dfs.rename(getCrc(_state.curr_path), getCrc(path));
                }
                catch (Exception e) {
                } // (don't care what the error is)
            }
        }

        /** Gets the CRC version of a file
         * @param p
         * @return
         */
        private Path getCrc(final Path p) {
            return new Path(p.getParent() + "/" + "." + p.getName() + ".crc");
        }

        /** Returns the filename corresponding to this object
         * @return
         */
        protected String getFilename() {
            final String suffix = getExtension(_stage) + _state.codec.map(s -> "." + s).orElse("");
            return _process_id + "_" + _thread_id + "_" + _state.segment + suffix;
        }
    }

    //////////////////////////////////////////////////////////////////////

    // UTILITIES

    /** Returns the codec string (normalized, e.g. "gzip" -> "gz")
     * @param storage_schema
     * @param stage
     * @return
     */
    public static Optional<String> getCanonicalCodec(final DataSchemaBean.StorageSchemaBean storage_schema,
            final IStorageService.StorageStage stage)
    {
        return Optional.ofNullable(getStorageSubSchema(storage_schema, stage)).map(ss -> ss.codec()).map(codec -> {
            if (codec.equalsIgnoreCase("gzip")) {
                return "gz";
            }
            if (codec.equalsIgnoreCase("snappy")) {
                return "sz";
            }
            if (codec.equalsIgnoreCase("snappy_framed")) {
                return "fr.sz";
            }
            else return codec;
        }).map(String::toLowerCase);
    }

    /** Wraps an output stream in one of the supported codecs
     * @param codec
     * @param original_output
     * @return
     */
    public static OutputStream wrapOutputInCodec(final Optional<String> codec, final OutputStream original_output) {
        return codec.map(Lambdas.wrap_u(c -> {
            if (c.equals("gz")) {
                return new java.util.zip.GZIPOutputStream(original_output);
            }
            else if (c.equals("sz")) {
                return new org.xerial.snappy.SnappyOutputStream(original_output);
            }
            else if (c.equals("fr.sz")) {
                return new org.xerial.snappy.SnappyFramedOutputStream(original_output);
            }
            else return null; // (fallback to no codec)
        })).orElse(original_output);
    }

    /** Very simple utility - if we know it's JSON then use that extension, otherwise use nothing
     * @param stage
     * @return
     */
    public static String getExtension(final IStorageService.StorageStage stage) {
        if (IStorageService.StorageStage.raw == stage) {
            return "";
        }
        else {
            return ".json";
        }
    }

    /** Utility function to map the storage type/bucket to a base directory
     * @param bucket
     * @param stage
     * @return
     */
    public static String getBasePath(final String root_path, final DataBucketBean bucket,
            final IStorageService.StorageStage stage, final Optional<String> job_name, final String buffer)
    {
        return Optional.of(Patterns.match().<String>andReturn()
                .when(__ -> stage == IStorageService.StorageStage.raw,
                        __ -> IStorageService.STORED_DATA_SUFFIX_RAW_SECONDARY + buffer)
                .when(__ -> stage == IStorageService.StorageStage.json,
                        __ -> IStorageService.STORED_DATA_SUFFIX_JSON_SECONDARY + buffer)
                .when(__ -> stage == IStorageService.StorageStage.processed,
                        __ -> IStorageService.STORED_DATA_SUFFIX_PROCESSED_SECONDARY + buffer)
                .when(__ -> stage == IStorageService.StorageStage.transient_output,
                        __ -> IStorageService.TRANSIENT_DATA_SUFFIX_SECONDARY + buffer + "/" + job_name.get())
                        //(job_name exists by construction in the transient output case)
                .otherwiseAssert()).map(s -> root_path + bucket.full_name() + s).get();
    }

    /** Gets the time based suffix, or IStorageService.NO_TIME_SUFFIX if it's not temporal
     * @param bucket
     * @param stage
     * @return
     */
    public static String getSuffix(final Date now, final DataBucketBean bucket,
            final IStorageService.StorageStage stage)
    {
        return Optionals.of(() -> bucket.data_schema().storage_schema())
                .map(store -> getStorageSubSchema(store, stage)).map(ss -> ss.grouping_time_period())
                .<String>map(period -> TimeUtils.getTimePeriod(period)
                        .map(d -> TimeUtils.getTimeBasedSuffix(d, Optional.empty()))
                        .validation(fail -> null, success -> DateUtils.formatDate(now, success)))
                .orElse(IStorageService.NO_TIME_SUFFIX);
    }

    /** Super low level utility to pick out the right storage sub-schema
     * @param store
     * @param stage
     * @return
     */
    public static StorageSchemaBean.StorageSubSchemaBean getStorageSubSchema(final StorageSchemaBean store,
            final IStorageService.StorageStage stage)
    {
        return Patterns.match().<StorageSchemaBean.StorageSubSchemaBean>andReturn()
                .when(__ -> stage == IStorageService.StorageStage.raw, __ -> store.raw())
                .when(__ -> stage == IStorageService.StorageStage.json, __ -> store.json())
                .when(__ -> stage == IStorageService.StorageStage.processed, __ -> store.processed())
                .when(__ -> stage == IStorageService.StorageStage.transient_output, __ -> getDefaultStorageSubSchema())
                .otherwiseAssert();
    }

    /** Just returns an empty storage sub-schema for transient output between jobs
     * @return a default/empty storage sub schema
     */
    public static StorageSchemaBean.StorageSubSchemaBean getDefaultStorageSubSchema() {
        return BeanTemplateUtils.build(StorageSchemaBean.StorageSubSchemaBean.class)
                .with(StorageSchemaBean.StorageSubSchemaBean::enabled, true).done().get();
    }
}
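
To make the API above concrete, here is a minimal usage sketch. It is illustrative only: the class ExampleHdfsWriteUsage and the arguments bucket, parent, storage_service, json1 and json2 are hypothetical placeholders assumed to be supplied by the surrounding Aleph2 application, and the batch settings shown are arbitrary values rather than recommendations.

// Hypothetical caller, placed in the same package as HfdsDataWriteService for brevity
package com.ikanow.aleph2.storage_service_hdfs.services;

import java.time.Duration;
import java.util.Arrays;
import java.util.Optional;

import com.fasterxml.jackson.databind.JsonNode;
import com.ikanow.aleph2.data_model.interfaces.data_services.IStorageService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IDataServiceProvider.IGenericDataService;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;

public class ExampleHdfsWriteUsage {

    public static void demo(final DataBucketBean bucket, final IGenericDataService parent,
            final IStorageService storage_service, final JsonNode json1, final JsonNode json2)
    {
        // Write JsonNode objects into the bucket's "processed" store, primary buffer
        final HfdsDataWriteService<JsonNode> write_service = new HfdsDataWriteService<>(
                bucket, parent, IStorageService.StorageStage.processed,
                Optional.empty(),  // job_name - only required for transient_output
                storage_service,
                Optional.empty()); // secondary_buffer - empty means the primary buffer

        // Optionally tune the batching (max objects per segment, max segment size in KB,
        // flush interval, number of writer threads) - these values are arbitrary
        write_service.getBatchWriteSubservice().ifPresent(batch ->
                batch.setBatchProperties(Optional.of(10_000), Optional.of(50L * 1024L),
                        Optional.of(Duration.ofMinutes(5L)), Optional.of(4)));

        // Single and bulk writes both go onto the shared batch queue
        write_service.storeObject(json1);
        write_service.storeObjects(Arrays.asList(json1, json2));
    }
}

Note that storeObject/storeObjects only enqueue onto the shared batch queue and return immediately; the worker threads write spool files and roll them into segments based on the configured object count, size, and flush interval.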
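
The static codec helpers can also be exercised on their own. The sketch below is a hedged illustration: the class ExampleCodecUsage, the hard-coded "gz" codec string and the sample JSON line are invented for this example, while the wrapOutputInCodec and getExtension calls are the ones defined above.

// Hypothetical standalone exercise of the codec utilities
package com.ikanow.aleph2.storage_service_hdfs.services;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Optional;

import com.ikanow.aleph2.data_model.interfaces.data_services.IStorageService;

public class ExampleCodecUsage {

    public static byte[] demo() throws IOException {
        // "gz" is the canonical form that getCanonicalCodec() produces for a configured codec of "gzip"
        final Optional<String> codec = Optional.of("gz");

        final ByteArrayOutputStream raw = new ByteArrayOutputStream();

        // Wraps the target stream in a GZIPOutputStream; an unrecognized codec falls back to the raw stream
        try (OutputStream out = HfdsDataWriteService.wrapOutputInCodec(codec, raw)) {
            out.write("{\"test\":true}\n".getBytes());
        }

        // On the file-name side: getExtension(IStorageService.StorageStage.json) returns ".json",
        // so a worker segment written with this codec is named like "<process>_<thread>_<segment>.json.gz"
        final String extension = HfdsDataWriteService.getExtension(IStorageService.StorageStage.json)
                + codec.map(c -> "." + c).orElse(""); // ".json.gz"

        return raw.toByteArray(); // gzip-compressed bytes of the sample line
    }
}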