Java tutorial: an Apache Flume HiveSink with Kafka consumer offset tracking
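The listing below appears to be a customized version of Apache Flume's HiveSink: on top of the stock sink it tracks Kafka consumer offsets per topic partition and commits them after each flushed batch, and it can derive Hive partition values from a date field in the JSON event body (the partitionGist / partitionPattern settings). A small standalone sketch of that partition derivation follows the listing.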
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flume.sink.hive;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.reflect.TypeToken;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.gson.Gson;
import org.apache.commons.lang.StringUtils;
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.flume.source.kafka.KafkaSource;
import org.apache.flume.source.kafka.KafkaSourceConstants;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Type;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

public class HiveSink extends AbstractSink implements Configurable {

  private static final Logger LOG = LoggerFactory.getLogger(HiveSink.class);

  private static final int DEFAULT_MAXOPENCONNECTIONS = 500;
  private static final int DEFAULT_TXNSPERBATCH = 100;
  private static final int DEFAULT_BATCHSIZE = 15000;
  private static final int DEFAULT_CALLTIMEOUT = 10000;
  private static final int DEFAULT_IDLETIMEOUT = 0;
  private static final int DEFAULT_HEARTBEATINTERVAL = 240; // seconds

  private Map<HiveEndPoint, HiveWriter> allWriters;

  private SinkCounter sinkCounter;
  private volatile int idleTimeout;
  private String metaStoreUri;
  private String proxyUser;
  private String database;
  private String table;
  private List<String> partitionVals;
  private Integer txnsPerBatchAsk;
  private Integer batchSize;
  private Integer maxOpenConnections;
  private boolean autoCreatePartitions;
  private String serializerType;
  private HiveEventSerializer serializer;
  private String partitionGist;
  private String partitionPattern;

  /**
   * Default timeout for blocking I/O calls in HiveWriter
   */
  private Integer callTimeout;
  private Integer heartBeatInterval;
  private ExecutorService callTimeoutPool;

  private boolean useLocalTime;
  private TimeZone timeZone;
  private boolean needRounding;
  private int roundUnit;
  private Integer roundValue;

  private Timer heartBeatTimer = new Timer();
  private AtomicBoolean timeToSendHeartBeat = new AtomicBoolean(false);

  // Control kafka consumer offset
  private String kafkaConsumerId;
  private KafkaConsumer kafkaConsumer;
  private Map<TopicPartition, OffsetAndMetadata> tpAndOffsetMetadata;

  @VisibleForTesting
  Map<HiveEndPoint, HiveWriter> getAllWriters() {
    return allWriters;
  }

  // read configuration and setup thresholds
  @Override
  public void configure(Context context) {
    metaStoreUri = context.getString(Config.HIVE_METASTORE);
    if (metaStoreUri == null) {
      throw new IllegalArgumentException(Config.HIVE_METASTORE + " config setting is not " +
          "specified for sink " + getName());
    }
    if (metaStoreUri.equalsIgnoreCase("null")) { // for testing support
      metaStoreUri = null;
    }
    proxyUser = context.getString("hive.proxyUser"); // not supported by hive api yet
    database = context.getString(Config.HIVE_DATABASE);
    if (database == null) {
      throw new IllegalArgumentException(Config.HIVE_DATABASE + " config setting is not " +
          "specified for sink " + getName());
    }
    table = context.getString(Config.HIVE_TABLE);

    String partitions = context.getString(Config.HIVE_PARTITION);
    if (partitions != null) {
      partitionVals = Arrays.asList(partitions.split(","));
    } else {
      partitionGist = context.getString(Config.HIVE_PARTITION_GIST);
      partitionPattern = context.getString(Config.HIVE_PARTITION_PATTERN);
    }

    txnsPerBatchAsk = context.getInteger(Config.HIVE_TXNS_PER_BATCH_ASK, DEFAULT_TXNSPERBATCH);
    if (txnsPerBatchAsk < 0) {
      LOG.warn(getName() + ". hive.txnsPerBatchAsk must be positive number. Defaulting to " +
          DEFAULT_TXNSPERBATCH);
      txnsPerBatchAsk = DEFAULT_TXNSPERBATCH;
    }
    batchSize = context.getInteger(Config.BATCH_SIZE, DEFAULT_BATCHSIZE);
    if (batchSize < 0) {
      LOG.warn(getName() + ". batchSize must be positive number. Defaulting to " +
          DEFAULT_BATCHSIZE);
      batchSize = DEFAULT_BATCHSIZE;
    }
    idleTimeout = context.getInteger(Config.IDLE_TIMEOUT, DEFAULT_IDLETIMEOUT);
    if (idleTimeout < 0) {
      LOG.warn(getName() + ". idleTimeout must be positive number. Defaulting to " +
          DEFAULT_IDLETIMEOUT);
      idleTimeout = DEFAULT_IDLETIMEOUT;
    }
    callTimeout = context.getInteger(Config.CALL_TIMEOUT, DEFAULT_CALLTIMEOUT);
    if (callTimeout < 0) {
      LOG.warn(getName() + ". callTimeout must be positive number. Defaulting to " +
          DEFAULT_CALLTIMEOUT);
      callTimeout = DEFAULT_CALLTIMEOUT;
    }
    heartBeatInterval = context.getInteger(Config.HEART_BEAT_INTERVAL, DEFAULT_HEARTBEATINTERVAL);
    if (heartBeatInterval < 0) {
      LOG.warn(getName() + ". heartBeatInterval must be positive number. Defaulting to " +
          DEFAULT_HEARTBEATINTERVAL);
      heartBeatInterval = DEFAULT_HEARTBEATINTERVAL;
    }
    maxOpenConnections = context.getInteger(Config.MAX_OPEN_CONNECTIONS, DEFAULT_MAXOPENCONNECTIONS);
    autoCreatePartitions = context.getBoolean("autoCreatePartitions", true);

    // Timestamp processing
    useLocalTime = context.getBoolean(Config.USE_LOCAL_TIME_STAMP, false);
    String tzName = context.getString(Config.TIME_ZONE);
    timeZone = (tzName == null) ?
        null : TimeZone.getTimeZone(tzName);
    needRounding = context.getBoolean(Config.ROUND, false);

    if (StringUtils.isNotBlank(context.getString(Config.KAFKA_CONSUMER_IDENTIFY))) {
      kafkaConsumerId = context.getString(Config.KAFKA_CONSUMER_IDENTIFY);
      tpAndOffsetMetadata = new HashMap<TopicPartition, OffsetAndMetadata>();
    }

    String unit = context.getString(Config.ROUND_UNIT, Config.MINUTE);
    if (unit.equalsIgnoreCase(Config.HOUR)) {
      this.roundUnit = Calendar.HOUR_OF_DAY;
    } else if (unit.equalsIgnoreCase(Config.MINUTE)) {
      this.roundUnit = Calendar.MINUTE;
    } else if (unit.equalsIgnoreCase(Config.SECOND)) {
      this.roundUnit = Calendar.SECOND;
    } else {
      LOG.warn(getName() + ". Rounding unit is not valid, please set one of " +
          "minute, hour or second. Rounding will be disabled");
      needRounding = false;
    }
    this.roundValue = context.getInteger(Config.ROUND_VALUE, 1);
    if (roundUnit == Calendar.SECOND || roundUnit == Calendar.MINUTE) {
      Preconditions.checkArgument(roundValue > 0 && roundValue <= 60,
          "Round value must be > 0 and <= 60");
    } else if (roundUnit == Calendar.HOUR_OF_DAY) {
      Preconditions.checkArgument(roundValue > 0 && roundValue <= 24,
          "Round value must be > 0 and <= 24");
    }

    // Serializer
    serializerType = context.getString(Config.SERIALIZER, "");
    if (serializerType.isEmpty()) {
      throw new IllegalArgumentException("serializer config setting is not " +
          "specified for sink " + getName());
    }
    serializer = createSerializer(serializerType);
    serializer.configure(context);

    Preconditions.checkArgument(batchSize > 0, "batchSize must be greater than 0");

    if (sinkCounter == null) {
      sinkCounter = new SinkCounter(getName());
    }
  }

  @VisibleForTesting
  protected SinkCounter getCounter() {
    return sinkCounter;
  }

  private HiveEventSerializer createSerializer(String serializerName) {
    if (serializerName.compareToIgnoreCase(HiveDelimitedTextSerializer.ALIAS) == 0 ||
        serializerName.compareTo(HiveDelimitedTextSerializer.class.getName()) == 0) {
      return new HiveDelimitedTextSerializer();
    } else if (serializerName.compareToIgnoreCase(HiveJsonSerializer.ALIAS) == 0 ||
        serializerName.compareTo(HiveJsonSerializer.class.getName()) == 0) {
      return new HiveJsonSerializer();
    }

    try {
      return (HiveEventSerializer) Class.forName(serializerName).newInstance();
    } catch (Exception e) {
      throw new IllegalArgumentException("Unable to instantiate serializer: " + serializerName +
          " on sink: " + getName(), e);
    }
  }

  /**
   * Pull events out of channel, find corresponding HiveWriter and write to it.
   * Take at most batchSize events per Transaction. <br/>
   * This method is not thread safe.
   */
  @Override
  public Status process() throws EventDeliveryException {
    // writers used in this Txn
    Channel channel = getChannel();
    Transaction transaction = channel.getTransaction();
    transaction.begin();
    boolean success = false;
    try {
      // 1 Enable Heart Beats
      if (timeToSendHeartBeat.compareAndSet(true, false)) {
        enableHeartBeatOnAllWriters();
      }

      // 2 Drain Batch
      int txnEventCount = drainOneBatch(channel);
      transaction.commit();
      success = true;

      // 3 Update Counters
      if (txnEventCount < 1) {
        return Status.BACKOFF;
      } else {
        return Status.READY;
      }
    } catch (InterruptedException err) {
      LOG.warn(getName() + ": Thread was interrupted.", err);
      return Status.BACKOFF;
    } catch (Exception e) {
      throw new EventDeliveryException(e);
    } finally {
      if (!success) {
        transaction.rollback();
      }
      transaction.close();
    }
  }

  // Drains one batch of events from Channel into Hive
  private int drainOneBatch(Channel channel) throws HiveWriter.Failure, InterruptedException {
    final String batchUUID = UUID.randomUUID().toString();
    int txnEventCount = 0;
    try {
      Map<HiveEndPoint, HiveWriter> activeWriters = Maps.newHashMap();
      for (; txnEventCount < batchSize; ++txnEventCount) {
        // 0) Read event from Channel
        Event event = channel.take();
        if (event == null) {
          break;
        }

        Map<String, String> headers = event.getHeaders();
        String body = new String(event.getBody(), Charset.forName("UTF-8"));
        Gson gson = new Gson();
        Type type = new TypeToken<Map<String, String>>() { }.getType();
        Map<String, String> bodys = gson.fromJson(body, type);

        // If no table was configured, fall back to the Kafka topic carried in the headers
        String topic = headers.get(KafkaSourceConstants.TYPE_HEADER);
        if (table == null && (table = topic) == null) {
          throw new IllegalArgumentException(Config.HIVE_TABLE + " config setting is not " +
              "specified for sink " + getName());
        }

        //1) Create end point by substituting place holders
        HiveEndPoint endPoint = makeEndPoint(metaStoreUri, database, table,
            partitionVals, event.getHeaders(), bodys, timeZone,
            needRounding, roundUnit, roundValue, useLocalTime);

        //2) Create or reuse Writer
        HiveWriter writer = getOrCreateWriter(activeWriters, endPoint);

        //3) Write
        LOG.debug("{} : Writing event to {}", getName(), endPoint);
        writer.write(event);

        // For each partition store next offset that is going to be read.
        if (tpAndOffsetMetadata != null) { // only when Kafka offset tracking is configured
          tpAndOffsetMetadata.put(
              new TopicPartition(headers.get(KafkaSourceConstants.TYPE_HEADER),
                  Integer.valueOf(headers.get(KafkaSourceConstants.PARTITION_HEADER))),
              new OffsetAndMetadata(Long.valueOf(headers.get(KafkaSourceConstants.OFFSET_HEADER)),
                  batchUUID));
        }
      } // for

      //4) Update counters
      if (txnEventCount == 0) {
        sinkCounter.incrementBatchEmptyCount();
      } else if (txnEventCount == batchSize) {
        sinkCounter.incrementBatchCompleteCount();
      } else {
        sinkCounter.incrementBatchUnderflowCount();
      }
      sinkCounter.addToEventDrainAttemptCount(txnEventCount);

      // 5) Flush all Writers
      for (HiveWriter writer : activeWriters.values()) {
        writer.flush(true);
      }

      if (!activeWriters.values().isEmpty()) {
        // update kafka consumer offsets once the batch has been flushed to Hive
        if (kafkaConsumer == null && kafkaConsumerId != null) {
          kafkaConsumer = KafkaSource.getConsumerById(kafkaConsumerId);
        }
        if (kafkaConsumer != null && !tpAndOffsetMetadata.isEmpty()) {
          kafkaConsumer.commitSyncThreadSafe(tpAndOffsetMetadata);
          tpAndOffsetMetadata.clear();
        }
      }

      sinkCounter.addToEventDrainSuccessCount(txnEventCount);
      return txnEventCount;
    } catch (HiveWriter.Failure e) {
      // in case of error we close all TxnBatches to start clean next time
      LOG.warn(getName() + " : " + e.getMessage(), e);
      abortAllWriters();
      closeAllWriters();
      throw e;
    }
  }

  private void enableHeartBeatOnAllWriters() {
    for (HiveWriter writer : allWriters.values()) {
      writer.setHearbeatNeeded();
    }
  }

  private HiveWriter getOrCreateWriter(Map<HiveEndPoint, HiveWriter> activeWriters,
                                       HiveEndPoint endPoint)
      throws HiveWriter.ConnectException, InterruptedException {
    try {
      HiveWriter writer = allWriters.get(endPoint);
      if (writer == null) {
        LOG.info(getName() + ": Creating Writer to Hive end point : " + endPoint);
        writer = new HiveWriter(endPoint, txnsPerBatchAsk, autoCreatePartitions,
            callTimeout, callTimeoutPool, proxyUser, serializer, sinkCounter);

        sinkCounter.incrementConnectionCreatedCount();
        if (allWriters.size() > maxOpenConnections) {
          int retired = closeIdleWriters();
          if (retired == 0) {
            closeEldestWriter();
          }
        }
        allWriters.put(endPoint, writer);
        activeWriters.put(endPoint, writer);
      } else {
        if (activeWriters.get(endPoint) == null) {
          activeWriters.put(endPoint, writer);
        }
      }
      return writer;
    } catch (HiveWriter.ConnectException e) {
      sinkCounter.incrementConnectionFailedCount();
      throw e;
    }
  }

  private HiveEndPoint makeEndPoint(String metaStoreUri, String database, String table,
                                    List<String> partVals, Map<String, String> headers,
                                    Map<String, String> bodys, TimeZone timeZone,
                                    boolean needRounding, int roundUnit, Integer roundValue,
                                    boolean useLocalTime) {
    ArrayList<String> realPartVals = Lists.newArrayList();
    if (partVals == null) {
      if (partitionGist == null || partitionPattern == null) {
        return new HiveEndPoint(metaStoreUri, database, table, null);
      } else {
        // Derive partition values by parsing a date out of the event body (the "gist")
        // and re-formatting it once per comma-separated pattern.
        SimpleDateFormat sdf = new SimpleDateFormat(Config.DATE_FORMAT);
        String gist = BucketPath.escapeString(partitionGist, bodys);
        try {
          Date d = sdf.parse(gist);
          List<String> patternList = Arrays.asList(partitionPattern.split(","));
          for (String pattern : patternList) {
            SimpleDateFormat s = new SimpleDateFormat(pattern);
            realPartVals.add(s.format(d));
          }
        } catch (ParseException e) {
          LOG.error("Error date format: " + gist, e);
        }
      }
    } else {
      for (String partVal : partVals) {
        realPartVals.add(BucketPath.escapeString(partVal, headers, timeZone,
            needRounding, roundUnit, roundValue, useLocalTime));
      }
    }
    return new HiveEndPoint(metaStoreUri, database, table, realPartVals);
  }

  /**
   * Locate writer that has not been used for the longest time and retire it.
   */
  private void closeEldestWriter() throws InterruptedException {
    long oldestTimeStamp = System.currentTimeMillis();
    HiveEndPoint eldest = null;
    for (Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
      if (entry.getValue().getLastUsed() < oldestTimeStamp) {
        eldest = entry.getKey();
        oldestTimeStamp = entry.getValue().getLastUsed();
      }
    }

    try {
      sinkCounter.incrementConnectionClosedCount();
      LOG.info(getName() + ": Closing least used Writer to Hive EndPoint : " + eldest);
      allWriters.remove(eldest).close();
    } catch (InterruptedException e) {
      LOG.warn(getName() + ": Interrupted when attempting to close writer for end point: " +
          eldest, e);
      throw e;
    }
  }

  /**
   * Locate all writers past idle timeout and retire them
   * @return number of writers retired
   */
  private int closeIdleWriters() throws InterruptedException {
    int count = 0;
    long now = System.currentTimeMillis();
    ArrayList<HiveEndPoint> retirees = Lists.newArrayList();

    //1) Find retirement candidates
    for (Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
      if (now - entry.getValue().getLastUsed() > idleTimeout) {
        ++count;
        retirees.add(entry.getKey());
      }
    }
    //2) Retire them
    for (HiveEndPoint ep : retirees) {
      sinkCounter.incrementConnectionClosedCount();
      LOG.info(getName() + ": Closing idle Writer to Hive end point : {}", ep);
      allWriters.remove(ep).close();
    }
    return count;
  }

  /**
   * Closes all writers and removes them from the cache
   */
  private void closeAllWriters() throws InterruptedException {
    //1) Retire writers
    for (Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
      entry.getValue().close();
    }
    //2) Clear cache
    allWriters.clear();
  }

  /**
   * Abort current Txn on all writers
   */
  private void abortAllWriters() throws InterruptedException {
    for (Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
      entry.getValue().abort();
    }
  }

  @Override
  public void stop() {
    // do not constrain close() calls with a timeout
    for (Entry<HiveEndPoint, HiveWriter> entry : allWriters.entrySet()) {
      try {
        HiveWriter w = entry.getValue();
        w.close();
      } catch (InterruptedException ex) {
        Thread.currentThread().interrupt();
      }
    }

    // shut down all thread pools
    callTimeoutPool.shutdown();
    try {
      while (!callTimeoutPool.isTerminated()) {
        callTimeoutPool.awaitTermination(Math.max(DEFAULT_CALLTIMEOUT, callTimeout),
            TimeUnit.MILLISECONDS);
      }
    } catch (InterruptedException ex) {
      LOG.warn(getName() + ": Shutdown interrupted on " + callTimeoutPool, ex);
    }
    callTimeoutPool = null;

    allWriters.clear();
    allWriters = null;
    sinkCounter.stop();
    super.stop();
    LOG.info("Hive Sink {} stopped", getName());
  }

  @Override
  public void start() {
    String timeoutName = "hive-" + getName() + "-call-runner-%d";
    // call timeout pool needs only 1 thd as sink is effectively single threaded
    callTimeoutPool = Executors.newFixedThreadPool(1,
        new ThreadFactoryBuilder().setNameFormat(timeoutName).build());

    this.allWriters = Maps.newHashMap();
    sinkCounter.start();
    super.start();
    setupHeartBeatTimer();
    LOG.info("Hive Sink {} started", getName());
  }

  private void setupHeartBeatTimer() {
    if (heartBeatInterval > 0) {
      heartBeatTimer.schedule(new TimerTask() {
        @Override
        public void run() {
          timeToSendHeartBeat.set(true);
          setupHeartBeatTimer();
        }
      }, heartBeatInterval * 1000);
    }
  }

  @Override
  public String toString() {
    return "{ Sink type:" + getClass().getSimpleName() + ", name:" + getName() + " }";
  }
}
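The least obvious part of the class is the partitionGist / partitionPattern branch in makeEndPoint(): a date string is pulled out of the JSON event body via BucketPath, parsed with Config.DATE_FORMAT, and then re-formatted once per comma-separated pattern to produce the partition values. The standalone sketch below reproduces that derivation with plain JDK classes so it can be run in isolation; the literal format strings and sample values are illustrative assumptions, not values taken from the Config class.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

/**
 * Standalone sketch of the partition derivation used in HiveSink.makeEndPoint():
 * parse one timestamp (the "gist") and render it once per pattern to obtain the
 * partition values. The format strings below are assumptions for illustration.
 */
public class PartitionGistSketch {
  public static void main(String[] args) throws ParseException {
    // Assumed stand-in for Config.DATE_FORMAT.
    SimpleDateFormat gistFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    // Assumed stand-in for the value BucketPath.escapeString(partitionGist, bodys) would return.
    String gist = "2023-06-01 14:35:00";
    // Assumed stand-in for the comma-separated partitionPattern setting.
    String partitionPattern = "yyyyMMdd,HH";

    Date d = gistFormat.parse(gist);
    List<String> partVals = new ArrayList<>();
    for (String pattern : Arrays.asList(partitionPattern.split(","))) {
      // One partition value per pattern, all rendered from the same parsed date.
      partVals.add(new SimpleDateFormat(pattern).format(d));
    }
    // With the inputs above this prints [20230601, 14].
    System.out.println(partVals);
  }
}

Note that when the gist cannot be parsed, makeEndPoint() in the listing only logs the ParseException, so the event ends up targeting an endpoint with an empty partition list rather than failing the batch.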