/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.monitor.trackers;

import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.statistics.HistogramConfig;
import org.apache.flink.statistics.LocalStatisticPlugin;
import org.apache.flink.statistics.StatisticTaskTrackerContext;
import org.apache.flink.statistics.StatisticTaskTrackerContext.Type;
import org.apache.flink.statistics.StatisticsRequest;
import org.apache.flink.statistics.histogram.SlicedBloomFilter;
import org.apache.flink.statistics.model.ApproxHistogramRecord;
import org.apache.flink.statistics.model.RuntimeStatisticsRecord;
import org.apache.flink.statistics.model.SimpleStatisticsKey;
import org.apache.flink.statistics.model.StatisticsCommit;
import org.apache.flink.statistics.model.StatisticsRecord;
import org.apache.flink.util.HistogramUtil;

/**
 * Task tracker that builds an (approximate) histogram over one field of the
 * tuples flowing through a task. Frequent keys are kept exactly in a head map;
 * all keys are additionally registered in a sliced Bloom filter that serves as
 * a presence indicator and as a linear-counting cardinality estimator.
 */
public class HistogramTaskTracker extends AbstractStatisticTaskTracker<Tuple> {

    protected static final Log LOG = LogFactory.getLog(HistogramTaskTracker.class);

    // Scalable Bloom filter used as presence indicator and to perform Linear Counting
    protected SlicedBloomFilter<Serializable> filter;

    protected Map<Serializable, AtomicLong> topKMap;

    boolean empty;

    private Map<Serializable, AtomicLong> exactMap;

    protected int tau;

    protected long tuplecount;

    // parameters for extensions and evaluation
    protected boolean exact = false;
    protected long reportEvery = -1;
    protected long bloomFilterSize = 0;

    private int adaptiveSummarySizeInPercent;
    private int adaptiveTau;
    private int summarySize = 0;
    private int stage = 0;
    private int keyPos = -1;
    private Class<? extends Serializable> keyClass;
    private TypeSerializer serializer;
    private String taskName;
    private boolean dumpLocally;
    private int index;
    private String dumpPath;
    private RuntimeStatisticsRecord storedRecord;

    public static final String FIELD = "field";

    public static final String TASK_HISTOGRAM = "taskhistogram";

    private AggregatorThread aggregator;
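    // ------------------------------------------------------------------------
    // The filter above doubles as a "Linear Counting" estimator: the number of
    // distinct keys can be approximated from the fill level of its bit array.
    // With m bits of which z are still zero, the estimate is -m * ln(z / m).
    // The sketch below illustrates only that formula; it assumes a plain
    // java.util.BitSet and is not part of SlicedBloomFilter's actual API.
    // ------------------------------------------------------------------------
    private static long linearCountingSketch(java.util.BitSet bits, int m) {
        int zeroBits = m - bits.cardinality(); // z: bits still unset (assumed > 0)
        return Math.round(-m * Math.log((double) zeroBits / m));
    }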
    /**
     * New instantiations are memory intensive but necessary, since the
     * transmission of histogram records is asynchronous.
     */
    private void setupTracker() {
        this.topKMap = new HashMap<Serializable, AtomicLong>();
        if (this.exact) {
            this.exactMap = new HashMap<Serializable, AtomicLong>();
        } else {
            this.filter = new SlicedBloomFilter<Serializable>(this.bloomFilterSize, 0.01);
        }
        this.tuplecount = 0;
    }

    public HistogramTaskTracker(StatisticTaskTrackerContext context, LocalStatisticPlugin plugin,
            Configuration configuration) {
        super(context, plugin);
        HistogramConfig conf = new HistogramConfig(configuration);
        this.reportEvery = conf.getReportEvery();
        this.serializer = conf.getTypeSerializer();
        this.exact = conf.getExact();
        this.bloomFilterSize = conf.getBloomFilterSize();
        this.tau = conf.getTau();
        this.keyClass = conf.getKeyClass();
        this.adaptiveSummarySizeInPercent = conf.getAdaptiveSummarySize();
        this.adaptiveTau = conf.getAdaptiveTau();
        this.summarySize = conf.getSummarySize();
        this.taskName = context.getTaskName();
        this.dumpLocally = conf.isDumpLocally();
        this.index = conf.getIndexString();
        this.dumpPath = conf.getDumpPath();
        this.empty = true;

        if (this.keyClass == null) {
            if (context.getType() == Type.INPUT) {
                throw new IllegalArgumentException(
                        "Histograms on input channels are only available if you specify a key class.");
            } else if (conf.getKeyClasses() == null) {
                LOG.warn("You have not specified a key class. Inefficient reflection-based copying will be used instead.");
            }
        }

        this.aggregator = new AggregatorThread();
        this.aggregator.start();
        this.aggregator.setTaskName(this.taskName);
        LOG.debug("Collecting histograms for " + this.getContext().getTaskName());
    }

    @Override
    public void collect(Tuple tuple) {
        this.empty = false;
        if (this.bloomFilterSize == 0) {
            throw new RuntimeException("Tracker was not configured, please call addPos()");
        }
        if (tuple == null) {
            return;
        }
        addToLocalHistogram(tuple);
        if (this.reportEvery > 0 && this.tuplecount % this.reportEvery == 0) {
            sendCurrentHistogram();
        }
    }

    private StatisticsCommit getCurrentHistogram(StatisticTaskTrackerContext context) {
        LOG.info("Getting current histogram for " + this.keyPos + " class id " + this);
        ApproxHistogramRecord record = null;
        String key = null;
        try {
            key = createTaskKey(context);
            if (this.exact) {
                LOG.debug("send taskhistogram " + key + ", headSize: " + this.exactMap.size());
            } else {
                LOG.debug("send taskhistogram " + key + ", headSize: " + this.topKMap.size() + ", headLimit: "
                        + this.summarySize + ", linearCounting: " + this.filter.linearCounting());
                restrictHead(this.topKMap, this.tau);
            }
            if (this.exact) {
                record = new ApproxHistogramRecord(context, this.keyPos, this.exactMap, this.tuplecount,
                        this.serializer);
            } else {
                record = new ApproxHistogramRecord(context, this.keyPos, this.filter, this.topKMap, this.tuplecount,
                        this.serializer);
            }
            // reset fields, because compileStatRecords() may be called multiple times
            setupTracker();
            record.setStage(this.stage);
            // If called by the super class, this is the final histogram.
            record.setFinal(true);
            record.setTau(this.tau);
            this.empty = true;
        } catch (ArithmeticException e) {
            LOG.error("arithmetic exception: " + context);
        }
        return new StatisticsCommit(new SimpleStatisticsKey(key), record);
    }
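    // getCurrentHistogram() hands the live maps over to the record and then
    // re-allocates them via setupTracker(): the old map is still referenced by
    // the in-flight record, so reusing it would race with the aggregator
    // thread. This is the allocation cost the setupTracker() javadoc refers to.
    // A minimal sketch of the same swap-and-publish pattern; this helper is
    // illustrative only and is not called by the tracker itself:
    private Map<Serializable, AtomicLong> snapshotHeadForHandOff() {
        Map<Serializable, AtomicLong> snapshot = this.topKMap; // now owned by the consumer
        setupTracker(); // fresh maps/filter for subsequent collect() calls
        return snapshot;
    }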
    /**
     * Set the histogram head to a relative size, given in percent of the size
     * of the local histogram. The tau value is determined by computing the
     * average frequency and multiplying it by this.adaptiveTau.
     */
    private void setAdaptiveHeadAndTau() {
        if (this.adaptiveSummarySizeInPercent > 0) {
            this.summarySize = (int) (this.topKMap.size() * this.adaptiveSummarySizeInPercent / 100);
        }
        if (this.adaptiveTau > 0) {
            long count = 0;
            for (AtomicLong f : this.topKMap.values()) {
                count += f.get();
            }
            if (!this.topKMap.isEmpty()) {
                int average = (int) (count / this.topKMap.size());
                this.tau = average * this.adaptiveTau;
            }
        }
    }

    /**
     * Remove the values from the histogram head whose frequencies are below tau.
     */
    public static void restrictHead(Map<Serializable, AtomicLong> topKMap, int tau) {
        // poor man's version: a full scan over the head
        for (Iterator<Entry<Serializable, AtomicLong>> eIt = topKMap.entrySet().iterator(); eIt.hasNext();) {
            Entry<Serializable, AtomicLong> e = eIt.next();
            if (e.getValue().intValue() < tau) {
                eIt.remove();
            }
        }
    }

    @Override
    public void compileResult(StatisticTaskTrackerContext context, LocalStatisticPlugin plugin) {
        if (!this.dumpLocally) {
            StatisticsCommit currentHistogram = getCurrentHistogram(context);
            this.aggregator.enqueueRecord((ApproxHistogramRecord) currentHistogram.getRecord());
            this.aggregator.stopAccumulator();
            ApproxHistogramRecord accumulator = this.aggregator.getAccumulator();
            currentHistogram = new StatisticsCommit(currentHistogram.getKey(), accumulator);
            plugin.sendStatistics(currentHistogram);
        } else {
            String fileName = this.dumpPath + this.taskName + "_" + this.index + "_" + System.currentTimeMillis();
            try {
                if (this.exact) {
                    HistogramUtil.writeHashMapToFile(this.exactMap, fileName);
                } else {
                    HistogramUtil.writeHashMapToFile(this.topKMap, fileName);
                }
            } catch (IOException ex) {
                LOG.error("Could not dump histogram to " + fileName, ex);
            }
        }
    }

    private final void addToLocalHistogram(Tuple tuple) {
        Serializable field = tuple.getField(this.keyPos);
        if (field == null) {
            return;
        }
        if (!this.exact) {
            /* pass the key to the Bloom filter for presence indication */
            this.filter.add(field);
            AtomicLong frequency = this.topKMap.get(field);
            if (frequency == null) {
                frequency = new AtomicLong(0L);
                this.topKMap.put(field, frequency);
            }
            frequency.incrementAndGet();
        } else {
            AtomicLong frequency = this.exactMap.get(field);
            if (frequency == null) {
                this.exactMap.put(field, new AtomicLong(1));
            } else {
                frequency.incrementAndGet();
            }
        }
        this.tuplecount++;
    }
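    // Illustration of restrictHead() above: with tau = 2, keys seen only once
    // are dropped and only the "heavy" keys survive. This method is not part
    // of the tracker logic; it exists purely to document the pruning contract.
    static void restrictHeadExample() {
        Map<Serializable, AtomicLong> head = new HashMap<Serializable, AtomicLong>();
        head.put("a", new AtomicLong(5)); // kept: 5 >= tau
        head.put("b", new AtomicLong(1)); // removed: 1 < tau
        head.put("c", new AtomicLong(2)); // kept: 2 >= tau
        restrictHead(head, 2);
        LOG.debug("pruned head: " + head); // a=5 and c=2 remain (map order may vary)
    }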
    public static String createTaskKey(StatisticTaskTrackerContext context) {
        return context.getTaskName() + "-" + context.getType().toString() + context.getIOIndex() + "."
                + context.getIndexInSubGroup();
    }

    public StatisticsRecord getStoredRecord() {
        return this.storedRecord;
    }

    @Override
    public String getStatName() {
        return StatisticsRequest.COLLECT_HISTOGRAM;
    }

    @Override
    public void addPos(int position) {
        if (this.keyPos < 0) {
            this.keyPos = position;
            LOG.debug("Collecting histograms at key position " + this.keyPos);
            this.topKMap = new HashMap<Serializable, AtomicLong>();
            if (this.exact) {
                this.exactMap = new HashMap<Serializable, AtomicLong>();
            } else {
                // note: uses a 10% false-positive rate here, vs. 1% in setupTracker()
                this.filter = new SlicedBloomFilter<Serializable>(this.bloomFilterSize, 0.1);
            }
            this.tuplecount = 0;
        } else {
            LOG.error("histogram key already set, ignoring requested position: " + position);
        }
    }

    protected final void sendCurrentHistogram() {
        ApproxHistogramRecord record = (ApproxHistogramRecord) getCurrentHistogram(this.getContext()).getRecord();
        record.setStage(this.stage++);
        record.setFinal(false);
        this.aggregator.enqueueRecord(record);
    }

    /**
     * Background thread that merges the intermediate histogram records of a
     * task into a single accumulator record.
     */
    private static class AggregatorThread extends Thread {

        /** Sentinel ("poison pill") that signals the end of the record stream. */
        ApproxHistogramRecord lastRecord = new ApproxHistogramRecord();

        private boolean done = false;

        private boolean running = false;

        private BlockingQueue<ApproxHistogramRecord> toCombine = new LinkedBlockingQueue<ApproxHistogramRecord>();

        /** this is the record to aggregate into */
        private ApproxHistogramRecord accumulator;

        // debug
        private long counter = 0;

        private String taskName;

        @Override
        public synchronized void start() {
            this.running = true;
            super.start();
        }

        public void stopAccumulator() {
            this.toCombine.offer(lastRecord);
            try {
                this.join();
            } catch (InterruptedException e) {
                LOG.error(e, e);
            }
        }

        public boolean hasRecordsToAccumulate() {
            return !this.toCombine.isEmpty();
        }

        @Override
        public void run() {
            while (true) {
                try {
                    ApproxHistogramRecord take = this.toCombine.take();
                    if (take == lastRecord) {
                        // the sentinel instance itself: no more records will arrive
                        break;
                    }
                    accumulate(take);
                } catch (InterruptedException e) {
                    LOG.error(e, e);
                }
            }
            done = true;
        }
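        // The shutdown protocol above is the classic "poison pill" pattern:
        // stopAccumulator() enqueues a dedicated sentinel object and run()
        // terminates when it dequeues that exact instance (an identity check,
        // not equals()). A generic sketch of the same idea, illustrative only:
        private static void drainUntilSentinel(BlockingQueue<Object> queue, Object sentinel)
                throws InterruptedException {
            while (true) {
                Object item = queue.take(); // blocks until an element arrives
                if (item == sentinel) { // the pill itself: producer signalled shutdown
                    return;
                }
                // ... process(item) would go here ...
            }
        }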
        private void accumulate(ApproxHistogramRecord current) {
            counter++;
            if (this.accumulator == null) {
                this.accumulator = current;
                this.accumulator.setUpperSummary(current.getSummary());
            } else {
                // do the same kind of merge logic as the TC histogram
                Map<Serializable, AtomicLong> upperHead = upperHead(current.getSummary(),
                        accumulator.getUpperSummary(), current.getBloomFilter(), accumulator.getBloomFilter());
                // merge the lower head (named part) into the accumulator's named part
                mergeInto(accumulator.getSummary(), current.getSummary());
                current.getBloomFilter().disjunction(accumulator.getBloomFilter().getFilter());
                // set the merged values into the accumulator
                accumulator.setUpperSummary(upperHead);
                accumulator.setTupleCount(accumulator.getTupleCount() + current.getTupleCount());
                current.clear();
            }
        }

        public void enqueueRecord(ApproxHistogramRecord toCombine) {
            this.toCombine.offer(toCombine);
        }

        private void mergeInto(Map<Serializable, AtomicLong> mergeHead, Map<Serializable, AtomicLong> secondMap) {
            for (Map.Entry<Serializable, AtomicLong> e : secondMap.entrySet()) {
                Serializable key = e.getKey();
                long count = e.getValue().longValue();
                AtomicLong mergeHeadVal = mergeHead.get(key);
                if (mergeHeadVal != null) {
                    mergeHeadVal.addAndGet(count);
                } else {
                    mergeHead.put(key, new AtomicLong(count));
                }
            }
        }

        private Map<Serializable, AtomicLong> mergeLists(Map<Serializable, AtomicLong> firstMap,
                Map<Serializable, AtomicLong> secondMap) {
            Map<Serializable, AtomicLong> mergedHead = new HashMap<Serializable, AtomicLong>();
            mergeInto(mergedHead, firstMap);
            mergeInto(mergedHead, secondMap);
            return mergedHead;
        }

        private Map<Serializable, AtomicLong> upperHead(Map<Serializable, AtomicLong> firstMap,
                Map<Serializable, AtomicLong> secondMap, SlicedBloomFilter<Serializable> firstFilter,
                SlicedBloomFilter<Serializable> secondFilter) {
            // Put every key that appears in at least one local histogram head
            // into upperHead; keys contained in both heads get their values added.
            Map<Serializable, AtomicLong> upperHead = mergeLists(firstMap, secondMap);
            // If a key in upperHead is missing from one local histogram head but
            // is stored in that task's Bloom filter, add the minimum value of the
            // respective local histogram head as an estimate of its frequency.
            long firstMin = getMin(firstMap);
            long secondMin = getMin(secondMap);
            for (Map.Entry<Serializable, AtomicLong> headEntry : upperHead.entrySet()) {
                Serializable headKey = headEntry.getKey();
                if (!firstMap.containsKey(headKey) && firstFilter.contains(headKey)) {
                    headEntry.getValue().addAndGet(firstMin);
                }
                if (!secondMap.containsKey(headKey) && secondFilter.contains(headKey)) {
                    headEntry.getValue().addAndGet(secondMin);
                }
            }
            return upperHead;
        }

        /** Minimal value in a local histogram head. */
        private long getMin(Map<Serializable, AtomicLong> map) {
            if (map.isEmpty()) {
                return 0;
            }
            long min = Long.MAX_VALUE;
            for (Map.Entry<Serializable, AtomicLong> e : map.entrySet()) {
                long act = e.getValue().get();
                if (act < min) {
                    min = act;
                }
            }
            return min;
        }

        public ApproxHistogramRecord getAccumulator() {
            if (!this.running) {
                this.accumulator.setFinal(true);
                LOG.info("Timing: histogram finished");
            }
            return accumulator;
        }

        public void setTaskName(String taskName) {
            this.taskName = taskName;
        }
    }
}
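// The merge performed by AggregatorThread.upperHead() can be tried out in
// isolation. The demo below re-implements the same rule with java.util.Set as
// a stand-in for SlicedBloomFilter (a deliberate simplification: a real Bloom
// filter answers contains() with occasional false positives). It is a sketch
// for documentation purposes only and is not part of the tracker.
class UpperHeadMergeDemo {

    public static void main(String[] args) {
        // Local histogram heads of two hypothetical tasks.
        Map<Serializable, AtomicLong> first = new HashMap<Serializable, AtomicLong>();
        first.put("x", new AtomicLong(10));
        Map<Serializable, AtomicLong> second = new HashMap<Serializable, AtomicLong>();
        second.put("y", new AtomicLong(4));

        // Presence indicators: the first task also saw "y", but below its head.
        java.util.Set<Serializable> firstSeen =
                new java.util.HashSet<Serializable>(java.util.Arrays.<Serializable>asList("x", "y"));
        java.util.Set<Serializable> secondSeen =
                new java.util.HashSet<Serializable>(java.util.Arrays.<Serializable>asList("y"));

        // Step 1: union of both heads, summing the counts of shared keys.
        Map<Serializable, AtomicLong> upper = new HashMap<Serializable, AtomicLong>();
        addAll(upper, first);
        addAll(upper, second);

        // Step 2: a key missing from one head but present in that task's
        // filter is credited that head's minimum frequency (an upper bound).
        long firstMin = min(first);
        long secondMin = min(second);
        for (Map.Entry<Serializable, AtomicLong> e : upper.entrySet()) {
            if (!first.containsKey(e.getKey()) && firstSeen.contains(e.getKey())) {
                e.getValue().addAndGet(firstMin);
            }
            if (!second.containsKey(e.getKey()) && secondSeen.contains(e.getKey())) {
                e.getValue().addAndGet(secondMin);
            }
        }
        // Prints x=10 and y=14 (y = 4 + first's minimum of 10); map order may vary.
        System.out.println(upper);
    }

    private static void addAll(Map<Serializable, AtomicLong> into, Map<Serializable, AtomicLong> from) {
        for (Map.Entry<Serializable, AtomicLong> e : from.entrySet()) {
            AtomicLong existing = into.get(e.getKey());
            if (existing != null) {
                existing.addAndGet(e.getValue().longValue());
            } else {
                into.put(e.getKey(), new AtomicLong(e.getValue().longValue()));
            }
        }
    }

    private static long min(Map<Serializable, AtomicLong> map) {
        long min = Long.MAX_VALUE;
        for (AtomicLong v : map.values()) {
            min = Math.min(min, v.get());
        }
        return map.isEmpty() ? 0 : min;
    }
}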