org.apache.flink.monitor.trackers.HistogramTaskTracker.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flink.monitor.trackers.HistogramTaskTracker.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.monitor.trackers;

import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.statistics.HistogramConfig;
import org.apache.flink.statistics.LocalStatisticPlugin;
import org.apache.flink.statistics.StatisticTaskTrackerContext;
import org.apache.flink.statistics.StatisticTaskTrackerContext.Type;
import org.apache.flink.statistics.StatisticsRequest;
import org.apache.flink.statistics.histogram.SlicedBloomFilter;
import org.apache.flink.statistics.model.ApproxHistogramRecord;
import org.apache.flink.statistics.model.RuntimeStatisticsRecord;
import org.apache.flink.statistics.model.SimpleStatisticsKey;
import org.apache.flink.statistics.model.StatisticsCommit;
import org.apache.flink.statistics.model.StatisticsRecord;
import org.apache.flink.util.HistogramUtil;

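/**
 * Statistic task tracker that builds a frequency histogram over one field of
 * the {@link Tuple}s passed to {@link #collect(Tuple)}.
 * <p>
 * In exact mode every key is counted in a hash map. Otherwise keys are counted
 * in a head map that is pruned by a frequency threshold (tau) before it is
 * shipped, and every key is additionally recorded in a
 * {@link SlicedBloomFilter}. Histogram snapshots are handed to a background
 * thread that merges them. When the result is compiled, either the merged
 * record is sent to the plugin or the current map is dumped to a local file.
 */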
public class HistogramTaskTracker extends AbstractStatisticTaskTracker<Tuple> {

    /** Logger shared by this tracker and its nested aggregator thread. */
    protected static final Log LOG = LogFactory.getLog(HistogramTaskTracker.class);

    // Scalable Bloom filter used as presence indicator and to perform Linear
    // Counting. Only allocated in approximate mode (exact == false); every
    // non-null key is added to it.
    protected SlicedBloomFilter<Serializable> filter;

    // Per-key frequencies collected in approximate mode; pruned by
    // restrictHead() before a record is built from it.
    protected Map<Serializable, AtomicLong> topKMap;
    // true after construction and after a histogram snapshot was taken, false
    // once collect() has run. Not read anywhere in this class.
    boolean empty;

    // Per-key frequencies collected in exact mode (exact == true).
    private Map<Serializable, AtomicLong> exactMap;
    // Frequency threshold: head entries below it are removed before shipping.
    protected int tau;
    // Number of tuples with a non-null key counted since the last reset.
    protected long tuplecount;

    // parameter for extensions and evaluation
    // exact: count every key in exactMap instead of topKMap + Bloom filter.
    protected boolean exact = false;
    // When > 0, collect() ships an intermediate histogram every reportEvery
    // counted tuples.
    protected long reportEvery = -1;
    // Size handed to the SlicedBloomFilter constructor; collect() rejects 0.
    protected long bloomFilterSize = 0;
    // Inputs of setAdaptiveHeadAndTau(): relative head size in percent and the
    // factor applied to the average frequency to derive tau.
    private int adaptiveSummarySizeInPercent;
    private int adaptiveTau;
    // Head limit; in this class it is only logged and recomputed by
    // setAdaptiveHeadAndTau().
    private int summarySize = 0;
    // Stage number stamped on shipped records; incremented per intermediate
    // report in sendCurrentHistogram().
    private int stage = 0;

    // Index of the tuple field used as histogram key; -1 until addPos() ran.
    private int keyPos = -1;
    // Key class from the configuration; only null-checked in the constructor.
    private Class<? extends Serializable> keyClass;
    // Serializer passed through to every ApproxHistogramRecord.
    private TypeSerializer serializer;
    // taskName, index and dumpPath form the file name used when dumpLocally is
    // set; taskName is also handed to the aggregator thread.
    private String taskName;
    private boolean dumpLocally;
    private int index;
    private String dumpPath;

    // NOTE(review): never assigned in this class, so getStoredRecord() appears
    // to always return null - confirm whether this is still needed.
    private RuntimeStatisticsRecord storedRecord;

    // String constants; not referenced inside this class.
    public static final String FIELD = "field";
    public static final String TASK_HISTOGRAM = "taskhistogram";

    // Background thread that merges the shipped histogram records.
    private AggregatorThread aggregator;

    /**
     * Replaces the collection state with fresh, empty structures and resets
     * the tuple counter.
     * <p>
     * Allocating new instances costs memory, but the previous ones cannot be
     * reused: they were handed to a histogram record that is processed
     * asynchronously.
     */
    private void setupTracker() {
        final Map<Serializable, AtomicLong> freshHead = new HashMap<Serializable, AtomicLong>();
        this.topKMap = freshHead;

        if (!this.exact) {
            // approximate mode keeps a Bloom filter next to the head map
            this.filter = new SlicedBloomFilter<Serializable>(this.bloomFilterSize, 0.01);
        } else {
            // exact mode counts every key in its own map
            this.exactMap = new HashMap<Serializable, AtomicLong>();
        }

        this.tuplecount = 0L;
    }

    /**
     * Creates a tracker, reads its settings from the given configuration and
     * starts the background thread that merges histogram records.
     * <p>
     * The key position is not set here; {@link #addPos(int)} has to be called
     * before tuples are collected.
     *
     * @param context
     *            context of the tracked task; supplies the task name and the
     *            channel type
     * @param plugin
     *            plugin passed on to the super class
     * @param configuration
     *            configuration wrapped in a {@link HistogramConfig}
     * @throws IllegalArgumentException
     *             if no key class is configured and the context type is
     *             {@link Type#INPUT}
     */
    public HistogramTaskTracker(StatisticTaskTrackerContext context, LocalStatisticPlugin plugin,
            Configuration configuration) {
        super(context, plugin);

        HistogramConfig conf = new HistogramConfig(configuration);
        this.reportEvery = conf.getReportEvery();
        this.serializer = conf.getTypeSerializer();
        this.exact = conf.getExact();
        this.bloomFilterSize = conf.getBloomFilterSize();
        this.tau = conf.getTau();
        this.keyClass = conf.getKeyClass();
        this.adaptiveSummarySizeInPercent = conf.getAdaptiveSummarySize();
        this.adaptiveTau = conf.getAdaptiveTau();
        this.summarySize = conf.getSummarySize();
        this.taskName = context.getTaskName();
        this.dumpLocally = conf.isDumpLocally();
        this.index = conf.getIndexString();
        this.dumpPath = conf.getDumpPath();
        this.empty = true;

        // Validate before the aggregator thread exists, so a rejected
        // configuration does not leave a running thread behind.
        if (this.keyClass == null) {
            if (context.getType() == Type.INPUT) {
                throw new IllegalArgumentException(
                        "Histograms on input channels are only available if you specify a Serializable class.");
            } else if (conf.getKeyClasses() == null) {
                LOG.warn(
                        "You have not specified a Serializable class. Inefficient reflection-based copying will be used instead.");
            }
        }

        // Configure the thread completely before starting it: state written
        // before Thread.start() is guaranteed to be visible to the new thread,
        // state written afterwards is not.
        this.aggregator = new AggregatorThread();
        this.aggregator.setTaskName(this.taskName);
        this.aggregator.start();
        LOG.debug("Collecting histograms for " + this.getContext().getTaskName());
    }

    /**
     * Adds the key field of the given tuple to the local histogram and, if
     * periodic reporting is enabled, ships an intermediate histogram every
     * {@code reportEvery} counted tuples.
     * <p>
     * {@code null} tuples are ignored. Tuples whose key field is {@code null}
     * are not counted and therefore never trigger a report.
     *
     * @param tuple
     *            the tuple to record, may be {@code null}
     * @throws RuntimeException
     *             if the configured Bloom filter size is 0
     */
    @Override
    public void collect(Tuple tuple) {
        if (this.bloomFilterSize == 0) {
            throw new RuntimeException("Tracker was not configured, please call addPos()");
        }
        if (tuple == null) {
            // nothing was collected, so the tracker stays "empty"
            return;
        }
        this.empty = false;

        final long countedBefore = this.tuplecount;
        addToLocalHistogram(tuple);

        // Only report when this tuple was actually counted. Without this
        // check a tuple with a null key would trigger a report whenever the
        // counter sits at 0 (0 % n == 0), i.e. before the first counted tuple
        // and right after every reset, shipping empty histograms.
        final boolean counted = this.tuplecount != countedBefore;
        if (counted && this.reportEvery > 0 && this.tuplecount % this.reportEvery == 0) {
            sendCurrentHistogram();
        }
    }

    /**
     * Builds a histogram record from the current collection state and resets
     * the tracker so that collection starts over with fresh structures.
     * <p>
     * In approximate mode the head is first pruned with
     * {@link #restrictHead(Map, int)}. The record is stamped with the current
     * stage and tau and is marked final; callers that ship an intermediate
     * histogram overwrite these flags.
     *
     * @param context
     *            context used to build the task key and the record
     * @return a commit wrapping the record; if an {@link ArithmeticException}
     *         occurred the commit's record may be {@code null} (and its key
     *         may have been built from {@code null}), so callers must check
     *         the record before using it
     */
    private StatisticsCommit getCurrentHistogram(StatisticTaskTrackerContext context) {
        LOG.info("Getting current histogram for " + this.keyPos + " class id " + this);

        ApproxHistogramRecord record = null;
        String key = null;
        try {
            key = createTaskKey(context);

            if (this.exact) {
                LOG.debug("send taskhistogram " + key + ", headSize: " + this.exactMap.size());
                record = new ApproxHistogramRecord(context, this.keyPos, this.exactMap, this.tuplecount,
                        this.serializer);
            } else {
                LOG.debug("send taskhistogram " + key + ", headSize: " + this.topKMap.size() + ", headLimit: "
                        + this.summarySize + ", linearCounting:" + this.filter.linearCounting());
                restrictHead(this.topKMap, this.tau);
                record = new ApproxHistogramRecord(context, this.keyPos, this.filter, this.topKMap, this.tuplecount,
                        this.serializer);
            }

            // The record now owns the old structures; allocate new ones
            // because this method may be called multiple times.
            setupTracker();
            record.setStage(this.stage);
            // If called by the super class, this is the final histogram.
            record.setFinal(true);
            record.setTau(this.tau);
            this.empty = true;
        } catch (ArithmeticException e) {
            // keep the cause: without the stack trace the failing computation
            // cannot be located
            LOG.error("arithmetic exception: " + context, e);
        }
        return new StatisticsCommit(new SimpleStatisticsKey(key), record);
    }

    /**
     * Derives the head limit and tau from the current local histogram.
     * <p>
     * If {@code adaptiveSummarySizeInPercent} is positive, the head limit
     * becomes that percentage of the current head size. If
     * {@code adaptiveTau} is positive and the head is not empty, tau becomes
     * the average frequency of the head entries multiplied by
     * {@code adaptiveTau}. Both results are capped at
     * {@link Integer#MAX_VALUE} instead of silently overflowing.
     */
    private void setAdaptiveHeadAndTau() {
        final int headSize = this.topKMap.size();

        if (this.adaptiveSummarySizeInPercent > 0) {
            // widen to long: headSize * percent may not fit into an int
            final long scaledSize = (long) headSize * this.adaptiveSummarySizeInPercent / 100;
            this.summarySize = (int) Math.min(Integer.MAX_VALUE, scaledSize);
        }

        if (this.adaptiveTau > 0 && headSize > 0) {
            long total = 0;
            for (AtomicLong frequency : this.topKMap.values()) {
                total += frequency.get();
            }
            // keep the average as long: frequencies are long counters, and
            // narrowing before the multiplication would corrupt large values
            final long average = total / headSize;
            if (average > Integer.MAX_VALUE / this.adaptiveTau) {
                this.tau = Integer.MAX_VALUE;
            } else {
                this.tau = (int) (average * this.adaptiveTau);
            }
        }
    }

    /**
     * Removes every entry from the histogram head whose frequency is below
     * {@code tau}. Entries with a frequency of exactly {@code tau} are kept.
     * The map is modified in place.
     *
     * @param topKMap
     *            the histogram head to prune; must support removal through its
     *            entry-set iterator
     * @param tau
     *            the minimal frequency an entry needs to stay in the head
     */
    public static void restrictHead(Map<Serializable, AtomicLong> topKMap, int tau) {
        // poor mans version
        for (Iterator<Entry<Serializable, AtomicLong>> it = topKMap.entrySet().iterator(); it.hasNext();) {
            // Compare the full long value. intValue() would truncate counts
            // above Integer.MAX_VALUE, which can make a very frequent key look
            // negative and get it removed.
            if (it.next().getValue().get() < tau) {
                it.remove();
            }
        }
    }

    /**
     * Produces the final result of this tracker.
     * <p>
     * Unless local dumping is enabled, the current histogram is handed to the
     * aggregator thread, the thread is stopped and the merged record is sent
     * through the plugin. With local dumping enabled, the current frequency
     * map is written to a file named from the dump path, task name, index and
     * current time instead. In both cases the aggregator thread is stopped.
     *
     * @param context
     *            context used to build the final histogram
     * @param plugin
     *            plugin that receives the merged histogram
     */
    @Override
    public void compileResult(StatisticTaskTrackerContext context, LocalStatisticPlugin plugin) {
        if (!this.dumpLocally) {
            StatisticsCommit currentHistogram = getCurrentHistogram(context);

            // getCurrentHistogram() yields a null record if building it
            // failed; the aggregator's queue does not accept null elements.
            ApproxHistogramRecord finalRecord = (ApproxHistogramRecord) currentHistogram.getRecord();
            if (finalRecord != null) {
                this.aggregator.enqueueRecord(finalRecord);
            } else {
                LOG.warn("Final histogram of task " + this.taskName + " could not be built");
            }
            this.aggregator.stopAccumulator();

            ApproxHistogramRecord accumulator = this.aggregator.getAccumulator();
            if (accumulator == null) {
                LOG.error("No histogram was accumulated for task " + this.taskName + ", nothing is sent");
                return;
            }
            plugin.sendStatistics(new StatisticsCommit(currentHistogram.getKey(), accumulator));
        } else {
            String fileName = this.dumpPath + this.taskName + "_" + this.index + "_" + System.currentTimeMillis();

            try {
                if (this.exact) {
                    HistogramUtil.writeHashMapToFile(this.exactMap, fileName);
                } else {
                    HistogramUtil.writeHashMapToFile(this.topKMap, fileName);
                }
            } catch (IOException ex) {
                // best effort: a failed dump must not fail the task, but it
                // must not go unnoticed either
                LOG.error("Could not dump histogram to " + fileName, ex);
            } finally {
                // The aggregator is a non-daemon thread blocked on its queue;
                // without this it would never terminate in dump mode.
                this.aggregator.stopAccumulator();
            }
        }
    }

    /**
     * Counts the key field of the given tuple in the local histogram.
     * <p>
     * A tuple whose key field is {@code null} is skipped and does not advance
     * the tuple counter. In approximate mode the key is also added to the
     * Bloom filter.
     *
     * @param tuple
     *            the tuple whose field at {@code keyPos} is counted
     */
    private final void addToLocalHistogram(Tuple tuple) {
        final Serializable key = tuple.getField(this.keyPos);
        if (key == null) {
            return;
        }

        if (this.exact) {
            final AtomicLong exactCount = this.exactMap.get(key);
            if (exactCount != null) {
                exactCount.incrementAndGet();
            } else {
                this.exactMap.put(key, new AtomicLong(1));
            }
        } else {
            /* pass Serializable to Bloom filter for presence indication */
            this.filter.add(key);

            AtomicLong headCount = this.topKMap.get(key);
            if (headCount == null) {
                headCount = new AtomicLong(0L);
                this.topKMap.put(key, headCount);
            }
            headCount.incrementAndGet();
        }

        this.tuplecount++;
    }

    /**
     * Builds the statistics key of a task channel in the form
     * {@code <taskName>-<type><ioIndex>.<indexInSubGroup>}.
     *
     * @param context
     *            the context supplying the four key components
     * @return the assembled key
     */
    public static String createTaskKey(StatisticTaskTrackerContext context) {
        final String taskAndType = context.getTaskName() + "-" + context.getType().toString();
        final String channelPosition = context.getIOIndex() + "." + context.getIndexInSubGroup();
        return taskAndType + channelPosition;
    }

    /**
     * Returns the record held in the {@code storedRecord} field.
     * <p>
     * NOTE(review): no code in this class assigns that field, so as far as
     * this file shows the result is always {@code null} - confirm whether
     * callers rely on this method.
     *
     * @return the stored record, possibly {@code null}
     */
    public StatisticsRecord getStoredRecord() {
        return this.storedRecord;
    }

    /**
     * Returns the statistic name of this tracker.
     *
     * @return the constant {@link StatisticsRequest#COLLECT_HISTOGRAM}
     */
    @Override
    public String getStatName() {
        return StatisticsRequest.COLLECT_HISTOGRAM;
    }

    /**
     * Sets the tuple field used as histogram key and allocates the collection
     * structures. Only the first call has an effect; once a non-negative
     * position is set, later requests are logged and ignored.
     *
     * @param position
     *            index of the tuple field to build the histogram on
     */
    @Override
    public void addPos(int position) {
        if (this.keyPos >= 0) {
            LOG.error("histogram Serializable already set, ignoring requested position: " + position);
            return;
        }

        this.keyPos = position;
        LOG.debug("Collecting histograms at Serializable posititon " + this.keyPos);

        // Allocate through setupTracker() so that the first Bloom filter is
        // built with the same parameters as every filter created after a
        // reset. Previously the first filter used a false-positive rate of
        // 0.1 and all later ones 0.01, although the aggregator combines the
        // filters of successive records.
        setupTracker();
    }

    /**
     * Ships the current histogram as an intermediate (non-final) report: the
     * record is stamped with the current stage, the stage counter is advanced
     * and the record is queued for the aggregator thread.
     * <p>
     * If no record could be built, the report is skipped and the stage
     * counter is left unchanged.
     */
    protected final void sendCurrentHistogram() {
        ApproxHistogramRecord record = (ApproxHistogramRecord) getCurrentHistogram(this.getContext()).getRecord();
        if (record == null) {
            // getCurrentHistogram() returns a null record when building it
            // failed; dereferencing or queueing it would throw
            LOG.warn("No histogram record could be built for stage " + this.stage
                    + ", skipping intermediate report");
            return;
        }

        // getCurrentHistogram() marks records as final; undo that here
        record.setStage(this.stage++);
        record.setFinal(false);
        this.aggregator.enqueueRecord(record);
    }

    /**
     * Background thread that merges the histogram records of one tracker into
     * a single accumulated record.
     * <p>
     * Records are handed over through a blocking queue. The thread terminates
     * when it takes the sentinel record that {@link #stopAccumulator()}
     * enqueues. The accumulated record is only written by this thread, so
     * callers should read it after {@link #stopAccumulator()} has returned.
     */
    private static class AggregatorThread extends Thread {

        /** Sentinel that tells {@link #run()} to stop; compared by identity. */
        final ApproxHistogramRecord lastRecord = new ApproxHistogramRecord();

        /** Set once the sentinel was taken and the loop ended normally. */
        private volatile boolean done = false;
        /** True from {@link #start()} until {@link #run()} has returned. */
        private volatile boolean running = false;
        private final BlockingQueue<ApproxHistogramRecord> toCombine = new LinkedBlockingQueue<ApproxHistogramRecord>();

        /** this is the record to aggregate into */
        private ApproxHistogramRecord accumulator;
        // debug
        private long counter = 0;
        private String taskName;

        @Override
        public synchronized void start() {
            this.running = true;
            super.start();
        }

        /**
         * Enqueues the sentinel and waits until this thread has processed all
         * records queued before it. If the waiting thread is interrupted, the
         * error is logged and the interrupt status is restored for the caller.
         */
        public void stopAccumulator() {
            this.toCombine.offer(this.lastRecord);
            try {
                this.join();
            } catch (InterruptedException e) {
                LOG.error(e, e);
                // do not swallow the interrupt; let the caller observe it
                Thread.currentThread().interrupt();
            }
        }

        /** @return {@code true} if records are waiting in the queue */
        public boolean hasRecordsToAccumulate() {
            return !this.toCombine.isEmpty();
        }

        /**
         * Takes records from the queue and merges them until the sentinel is
         * taken. An interrupt while waiting is logged and waiting continues,
         * so queued records are not lost.
         */
        @Override
        public void run() {
            try {
                while (true) {
                    try {
                        ApproxHistogramRecord next = this.toCombine.take();
                        if (next == this.lastRecord) {
                            break;
                        }
                        accumulate(next);
                    } catch (InterruptedException e) {
                        LOG.error(e, e);
                    }
                }
                this.done = true;
            } finally {
                // Without clearing this flag getAccumulator() could never
                // mark the merged record as final.
                this.running = false;
            }
        }

        /**
         * Merges one record into the accumulator. The first record becomes
         * the accumulator itself; every later record is merged into it and
         * then cleared.
         */
        private void accumulate(ApproxHistogramRecord current) {
            this.counter++;
            if (this.accumulator == null) {
                this.accumulator = current;
                this.accumulator.setUpperSummary(current.getSummary());
                return;
            }

            // do the same kind of logic from TC histogram
            // The upper head has to be computed before the summaries are
            // merged, because mergeInto() modifies the accumulator's summary.
            Map<Serializable, AtomicLong> mergedUpperHead = upperHead(current.getSummary(),
                    this.accumulator.getUpperSummary(), current.getBloomFilter(), this.accumulator.getBloomFilter());
            // Merge lower head (named part) into the accumulator's named part:
            mergeInto(this.accumulator.getSummary(), current.getSummary());

            // NOTE(review): this applies the accumulator's filter to the
            // filter of `current`, which is cleared below. If disjunction()
            // modifies its receiver, the accumulator's filter never learns
            // the keys of `current` - confirm the intended direction.
            current.getBloomFilter().disjunction(this.accumulator.getBloomFilter().getFilter());
            // set values into accumulator...
            this.accumulator.setUpperSummary(mergedUpperHead);
            this.accumulator.setTupleCount(this.accumulator.getTupleCount() + current.getTupleCount());

            current.clear();
        }

        /**
         * Queues a record for merging. A {@code null} record is logged and
         * ignored, because the queue does not accept {@code null} elements.
         */
        public void enqueueRecord(ApproxHistogramRecord toCombine) {
            if (toCombine == null) {
                LOG.warn("Ignoring null histogram record");
                return;
            }
            this.toCombine.offer(toCombine);
        }

        /**
         * Adds every count of {@code secondMap} to {@code mergeHead}, creating
         * entries for keys that {@code mergeHead} does not contain yet.
         * {@code secondMap} is not modified.
         */
        private void mergeInto(Map<Serializable, AtomicLong> mergeHead, Map<Serializable, AtomicLong> secondMap) {
            for (Map.Entry<Serializable, AtomicLong> e : secondMap.entrySet()) {
                Serializable key = e.getKey();
                long count = e.getValue().longValue();
                AtomicLong existing = mergeHead.get(key);
                if (existing != null) {
                    existing.addAndGet(count);
                } else {
                    mergeHead.put(key, new AtomicLong(count));
                }
            }
        }

        /**
         * Returns a new map holding the per-key sums of both maps. Neither
         * input map nor any of its counters is modified.
         */
        private Map<Serializable, AtomicLong> mergeLists(Map<Serializable, AtomicLong> firstMap,
                Map<Serializable, AtomicLong> secondMap) {
            Map<Serializable, AtomicLong> mergedHead = new HashMap<Serializable, AtomicLong>();
            mergeInto(mergedHead, firstMap);
            mergeInto(mergedHead, secondMap);
            return mergedHead;
        }

        /**
         * Computes the upper head of two histograms: the per-key sum of both
         * heads, increased by the minimal head frequency of a histogram for
         * every key that is missing from that histogram's head but reported
         * by its Bloom filter.
         * <p>
         * NOTE(review): the filters are dereferenced without a null check;
         * confirm that records built in exact mode carry a filter.
         */
        private Map<Serializable, AtomicLong> upperHead(Map<Serializable, AtomicLong> firstMap,
                Map<Serializable, AtomicLong> secondMap, SlicedBloomFilter<Serializable> firstFilter,
                SlicedBloomFilter<Serializable> secondFilter) {
            // put all keys into upperHead that appear in at least one local
            // histogram head, if key is already contained in
            // upperHead, add value
            Map<Serializable, AtomicLong> upperHead = mergeLists(firstMap, secondMap);

            // if a key that has been added to upperHead is not contained in
            // a local histogram head, but is stored in its Bloom filter, then
            // add the minimum value of the respective local histogram
            long firstMin = getMin(firstMap);
            long secondMin = getMin(secondMap);

            for (Map.Entry<Serializable, AtomicLong> headEntry : upperHead.entrySet()) {
                Serializable headKey = headEntry.getKey();
                if (!firstMap.containsKey(headKey) && firstFilter.contains(headKey)) {
                    headEntry.getValue().addAndGet(firstMin);
                }

                if (!secondMap.containsKey(headKey) && secondFilter.contains(headKey)) {
                    headEntry.getValue().addAndGet(secondMin);
                }
            }

            return upperHead;
        }

        /** Returns the minimal value in a local histogram head, or 0 if it is empty. */
        private long getMin(Map<Serializable, AtomicLong> map) {
            if (map.isEmpty()) {
                return 0;
            }
            long min = Long.MAX_VALUE;
            for (Map.Entry<Serializable, AtomicLong> e : map.entrySet()) {
                long act = e.getValue().get();
                if (act < min) {
                    min = act;
                }
            }
            return min;
        }

        /**
         * Returns the accumulated record. Once this thread has finished, the
         * record is marked as final first.
         *
         * @return the accumulated record, or {@code null} if no record was
         *         merged
         */
        public ApproxHistogramRecord getAccumulator() {
            if (!this.running) {
                if (this.accumulator != null) {
                    this.accumulator.setFinal(true);
                }
                LOG.info("Timing: histogram finished");
            }
            return this.accumulator;
        }

        public void setTaskName(String taskName) {
            this.taskName = taskName;
        }

    }

}