org.openmainframe.ade.scores.LastSeenScorer.java Source code

Java tutorial

Introduction

Here is the source code for org.openmainframe.ade.scores.LastSeenScorer.java

Source

/*
     
Copyright IBM Corp. 2010, 2016
This file is part of Anomaly Detection Engine for Linux Logs (ADE).
    
ADE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
ADE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with ADE.  If not, see <http://www.gnu.org/licenses/>.
     
*/
package org.openmainframe.ade.scores;

import java.io.PrintStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.commons.lang3.StringUtils;
import org.openmainframe.ade.data.IAnalyzedInterval;
import org.openmainframe.ade.data.IAnalyzedMessageSummary;
import org.openmainframe.ade.exceptions.AdeException;
import org.openmainframe.ade.exceptions.AdeFlowException;
import org.openmainframe.ade.impl.PropertyAnnotation.Property;
import org.openmainframe.ade.scoringApi.MessageScorer;
import org.openmainframe.ade.scoringApi.StatisticsChart;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Calculate scores for the deltas belonging to a message id. A delta is defined as the
 * time difference in seconds between two instances of the same message (i.e. two messages
 * with the same message id). For example, if we have some message instance X with id MSGID 
 * appearing at time T1, then the delta value is T1 - T2 where T2 < T1 and T2 is the time of
 * the MOST RECENT message instance Y with id MSGID. These deltas are used to compute log
 * probabilities which get stored in the StatisticsChart.
 */
public class LastSeenScorer extends MessageScorer {
    /**
     * Default logger for this class.
     */
    private static final Logger logger = LoggerFactory.getLogger(LastSeenScorer.class);
    /**
     * The serialized ID.
     */
    private static final long serialVersionUID = 1L;

    private static final double DEFAULT_FOR_NEW = 0d;
    @Property(key = "minimalValueForTallesBin", help = "Minimal count for most common bin.  Less than this and no model will be created.", required = false)
    private int m_minimalMax = 10;
    @Property(key = "minimalConcentration", help = "minimal required average points per bin", required = false)
    private double m_minimalConcentration = 2.0;

    /**
     * Given a sequence of points in R1 (e.g. times) x1 <= x2 <= ... <= xn (the
     * input is the distances [xi+1 - xi]), look whether there are typical
     * differences [xi+1 - xi] (up to 3 modes) and build clusters around each
     * mode. The likelihood in each cluster is then obtained via the Chebyshev
     * inequality; a new point will be considered an anomaly if there is a
     * significant difference between the probability in the training points
     * and the point in question.
     */
    private class PerodicityBounder implements Serializable {

        /**
         * Stores the value of log(.5).
         */
        private final double m_logHalf = Math.log(0.5);
        /**
         * The serial ID.
         */
        private static final long serialVersionUID = 1L;

        /**
         * Threshold for the minimal points per cluster.
         */
        int mMinimalPointsPerCluster;
        /**
         * For determining if we have used this scorer during training.
         */
        boolean mTrained;

        /**
         * Maps the time difference in seconds from the most recently seen message instance since some message 
         * instance x with the same message id to the number of times this time difference has been seen.
         */
        TreeMap<Integer, Integer> mPointDifferences;
        /**
         * The total number of points we have added.
         */
        int mPoints;

        /**
         * Maps each point (delta) to its score, where the score is the log of the ratio between the max
         * value and the point's value. (max is the occurrence count of the time difference that occurs the
         * most in mPointDifferences, and a point's value is the number of occurrences of this time
         * difference/point).
         */
        private TreeMap<Integer, Double> m_pointScores;
        /**
         * Log of the highest occurrence count among all deltas/points (set by computeScores()).
         */
        private double m_lMax = 0;
        /**
         * The negative log of the ratio between the highest occurrence count and the total number
         * of points/deltas (set by computeScores()).
         */
        private double m_llMax = 0;

        /**
         * Creates an untrained bounder: the delta histogram starts empty, no score
         * table exists yet, the point counter is zero, and at least 10 points are
         * required before a model is built.
         */
        public PerodicityBounder() {
            mPoints = 0;
            mPointDifferences = new TreeMap<Integer, Integer>();
            m_pointScores = null;
            mTrained = false;
            mMinimalPointsPerCluster = 10;
        }

        /**
         * Records a batch of deltas in the histogram, one {@code addPoint} call per entry.
         * A {@code null} batch is silently ignored.
         * @param deltas time differences in seconds, each measured between a message instance
         * and the MOST RECENTLY seen earlier instance carrying the same message id; may be
         * {@code null}.
         */
        public void addPoints(Integer[] deltas) {
            if (deltas == null) {
                return;
            }
            for (int i = 0; i < deltas.length; ++i) {
                final int secondsSincePrevious = deltas[i];
                addPoint(secondsSincePrevious);
            }
        }

        /**
         * Finalizes this bounder after all deltas for a message id have been collected.
         * Scores are computed only when the total number of recorded points reaches
         * the minimal amount of points per cluster; otherwise no model is built.
         * In either case the histogram of point differences is released and the
         * bounder is marked as trained.
         * @param name the message id; used only for debug logging and may be {@code null}.
         */
        public void train(String name) {
            assert !mTrained;
            // Deliberately no assertion on the histogram size: the enclosing scorer registers
            // a bounder for every message id it encounters, even when no delta could be
            // extracted for it, so an empty histogram is a legitimate state that simply
            // produces no model (mPoints stays below the threshold).
            if (mPoints >= mMinimalPointsPerCluster) {
                if (m_debugPrint) {
                    if (name != null) {
                        logger.info("trainig last seen model for {}", name);
                    }
                    for (Entry<Integer, Integer> pointDiff : mPointDifferences.entrySet()) {
                        logger.info("  {}, {}", pointDiff.getKey(), pointDiff.getValue());
                    }
                }
                computeScores();
            }
            // The raw histogram is no longer needed once the scores have been derived.
            mPointDifferences = null;
            mTrained = true;
        }

        /**
         * Derives the score table from the histogram in "mPointDifferences".
         * The log of the highest occurrence count and the negative log of that count's share
         * of all points are always stored. The per-delta table is built only when the highest
         * count exceeds the configured minimum and the average number of points per distinct
         * delta exceeds the configured concentration. Each delta seen more than once then gets
         * the score log(highest count) - log(its own count); deltas seen only once are left out.
         */
        private void computeScores() {
            final int tallestBin = getMaxValue();
            m_lMax = Math.log(tallestBin);
            m_llMax = -Math.log((double) tallestBin / mPoints);

            if (tallestBin <= m_minimalMax) {
                return;
            }
            final double pointsPerBin = ((double) mPoints) / mPointDifferences.size();
            if (!(pointsPerBin > m_minimalConcentration)) {
                return;
            }

            final TreeMap<Integer, Double> table = new TreeMap<Integer, Double>();
            for (Entry<Integer, Integer> bin : mPointDifferences.entrySet()) {
                final int occurrences = bin.getValue();
                if (occurrences > 1) {
                    table.put(bin.getKey(), m_lMax - Math.log(occurrences));
                }
            }
            m_pointScores = table;
        }

        /**
         * Finds the highest occurrence count stored in the histogram.
         * @return the number of occurrences of the most frequent delta, or 0 when the
         * histogram holds no entries.
         */
        private int getMaxValue() {
            int tallest = 0;
            for (final Integer occurrences : mPointDifferences.values()) {
                tallest = Math.max(tallest, occurrences);
            }
            return tallest;
        }

        /**
         * Looks up the score of every delta in the table built by "computeScores()".
         * From the second delta onwards, the score stored for the sum of the previous and the
         * current delta (the gap that would result if the message in between were absent) is
         * looked up as well; it replaces the direct score when the direct score is missing or
         * when it is smaller. A delta for which neither lookup yields a score receives
         * getNeverSeenScore().
         * @param deltas time differences in seconds, each measured between a message instance
         * and the MOST RECENTLY seen earlier instance carrying the same message id.
         * @return one score per delta; an empty array when {@code deltas} is {@code null} or
         * when no score table was built.
         */
        public double[] getScore(Integer[] deltas) {
            if (deltas == null) {
                return new double[0];
            }
            assert (deltas.length > 0);
            if (m_pointScores == null) {
                return new double[0];
            }
            final int total = deltas.length;
            final double[] result = new double[total];
            for (int i = 0; i < total; ++i) {
                Double best = m_pointScores.get(deltas[i]);
                if (i > 0) {
                    // Score of the combined gap, as if one extra message had slipped in between.
                    final Double merged = m_pointScores.get(deltas[i - 1] + deltas[i]);
                    if (merged != null && (best == null || merged < best)) {
                        best = merged;
                    }
                }
                if (best != null) {
                    result[i] = best;
                } else {
                    result[i] = getNeverSeenScore();
                }
            }
            return result;
        }

        /**
         * Accessor for the m_llMax value stored by "computeScores()".
         * @return the negative log of the ratio between the highest occurrence count and
         * the total number of points; 0 if scores were never computed.
         */
        public double getLLMax() {
            return this.m_llMax;
        }

        /**
         * Score assigned to a delta that has no entry in m_pointScores.
         * @return the log of the highest occurrence count minus log(.5).
         */
        public double getNeverSeenScore() {
            final double neverSeen = m_lMax - m_logHalf;
            return neverSeen;
        }

        /**
         * Registers one more point in the sequence: the occurrence counter kept in
         * mPointDifferences for this time difference is incremented (starting at 1 for a
         * difference not seen before) and the total point count grows by one.
         * @param distanceFromLastPoint the number of seconds since the last point, i.e. since
         * the last time this message occurred.
         */
        private void addPoint(int distanceFromLastPoint) {
            assert distanceFromLastPoint >= 0;
            final Integer seenSoFar = mPointDifferences.get(distanceFromLastPoint);
            final int updated = (seenSoFar == null) ? 1 : seenSoFar + 1;
            mPointDifferences.put(distanceFromLastPoint, updated);
            mPoints++;
        }

        /**
         * Print out the object state for debugging purposes: a header line with the value of
         * getLLMax() (only when a name is given), then one "delta, score" line per entry of the
         * score table, and finally the score that deltas without a table entry receive.
         * Nothing but the header is printed when no score table exists or when it is empty.
         * @param name the message id; the header line is skipped when {@code null}.
         * @param out the output stream for printing out object state.
         * @throws AdeException declared for callers; not thrown by the code in this method.
         */
        public void debugPrint(String name, PrintStream out) throws AdeException {
            if (name != null) {
                out.println("Last seen model for " + name + ": " + getLLMax());
            }
            if (m_pointScores != null && !m_pointScores.isEmpty()) {
                for (Entry<Integer, Double> pointScore : m_pointScores.entrySet()) {
                    out.println("  " + pointScore.getKey() + ", " + pointScore.getValue());
                }
                // Report the value getScore() really assigns to unseen deltas, so the
                // "missing" line is on the same scale as the per-delta lines above.
                out.println("  missing, " + getNeverSeenScore());
            }
        }
    }

    /**
     * Maps message id to its periodicity bounder.
     */
    private TreeMap<String, PerodicityBounder> m_lastSeen;

    /**
     * For determining if we have used this scorer during training.
     */
    transient private boolean m_trained = false;

    @Property(key = "printDebug", required = false, help = "Print debug information to system.out")
    private boolean m_debugPrint = false;

    /**
     * Scores one message summary against the model trained for its message id.
     * When a PerodicityBounder exists for the id, its per-delta scores are fetched; if any
     * were returned they are stored under "PerTickScores", the largest of them (never less
     * than 0) becomes the score, the score plus the bounder's getLLMax() is stored under
     * "LogProbGivenLast", and getLLMax() itself under "maxl". When no bounder exists for the
     * id, the default score for new messages is used. Finally the probability exp(-score),
     * the anomaly value (1 - probability) and the score itself are added to the chart.
     * @param analyzedMessageSummary The analysis results of a MessageSummary object; supplies
     * the message id and the statistics from which the deltas are extracted.
     * @param analyzedInterval contains a summary of the interval; not used by this scorer.
     * @return The StatisticsChart holding the statistics described above.
     * @throws AdeException declared by the scorer contract.
     */
    @Override
    public StatisticsChart getScore(IAnalyzedMessageSummary analyzedMessageSummary,
            IAnalyzedInterval analyzedInterval) throws AdeException {
        final StatisticsChart chart = new StatisticsChart();
        final String messageId = analyzedMessageSummary.getMessageId();

        double worstScore = 0d;
        final PerodicityBounder bounder = m_lastSeen.get(messageId);
        if (bounder == null) {
            worstScore = DEFAULT_FOR_NEW;
        } else {
            final double[] perTick = bounder.getScore(extractDelta(analyzedMessageSummary));
            if (perTick.length > 0) {
                chart.setStat("PerTickScores", Arrays.toString(perTick));
                for (final double tick : perTick) {
                    if (tick > worstScore) {
                        worstScore = tick;
                    }
                }
                chart.setStat("LogProbGivenLast", worstScore + bounder.getLLMax());
                chart.setStat("maxl", bounder.getLLMax());
            }
        }

        final Double probability = Math.exp(0 - worstScore);
        chart.setStat(MAIN, probability);
        chart.setStat(ANOMALY, 1 - probability);
        chart.setStat(LOG_PROB, worstScore);
        return chart;
    }

    /**
     * Reports whether another learning iteration is required.
     * @return {@code true} as long as this scorer has not been trained yet.
     */
    @Override
    public boolean needsAnotherIteration() throws AdeException {
        final boolean alreadyTrained = m_trained;
        return !alreadyTrained;
    }

    /**
     * Prepares a learning iteration: any previously collected bounders are replaced by an
     * empty map and the scorer is flagged as not trained.
     */
    @Override
    public void startIteration() throws AdeException {
        m_lastSeen = new TreeMap<String, PerodicityBounder>();
        m_trained = false;
    }

    /**
     * Called at the beginning of a stream. This scorer performs no work here.
     */
    @Override
    public void beginOfStream() throws AdeException, AdeFlowException {
        // Intentionally empty.
    }

    /**
     * Feeds one analyzed interval into the learner. For every message summary of the
     * interval the delta values are extracted and added as points to the bounder of the
     * summary's message id; a bounder is created on first sight of an id.
     * @param analyzedInterval contains summary results of an interval.
     */
    @Override
    public void incomingObject(IAnalyzedInterval analyzedInterval) throws AdeException, AdeFlowException {
        for (IAnalyzedMessageSummary summary : analyzedInterval.getAnalyzedMessages()) {
            final String id = summary.getMessageId();
            final Integer[] deltas = extractDelta(summary);
            if (!m_lastSeen.containsKey(id)) {
                m_lastSeen.put(id, new PerodicityBounder());
            }
            m_lastSeen.get(id).addPoints(deltas);
        }
    }

    /**
     * Reads the delta values (change in seconds between each message instance) from the
     * string statistic named after the LastSeenLoggingScorerContinuous class with the suffix
     * ".res", and converts them into an integer array. The statistic is expected to look like
     * "[d1, d2, ...]".
     * @param ms The analysis results of a MessageSummary object whose statistics hold the
     * delta list.
     * @return The delta values in an integer array, or {@code null} when the list is "[]",
     * when the statistic is absent, or when it is not enclosed in square brackets. Both
     * callers in this class treat {@code null} as "no deltas".
     * @throws NumberFormatException if an entry of the list is not a decodable integer.
     */
    protected Integer[] extractDelta(IAnalyzedMessageSummary ms) {
        final String statName = LastSeenLoggingScorerContinuous.class.getSimpleName() + "." + "res";
        final String rawDelta = ms.getStatistics().getStringStat(statName);
        if (rawDelta == null) {
            // NOTE(review): assumes getStringStat yields null for an absent statistic - confirm.
            logger.warn("Statistic {} is missing for message id {}; treating it as having no deltas",
                    statName, ms.getMessageId());
            return null;
        }
        if (rawDelta.equals("[]")) {
            return null;
        }
        // substringBetween returns null when the brackets are missing, and split passes
        // that null through, so a malformed value surfaces here as a null token array.
        final String[] tokens = StringUtils.split(StringUtils.substringBetween(rawDelta, "[", "]"), ", ");
        if (tokens == null) {
            logger.warn("Ignoring malformed delta list '{}' for message id {}", rawDelta, ms.getMessageId());
            return null;
        }
        final Integer[] delta = new Integer[tokens.length];
        for (int i = 0; i < tokens.length; ++i) {
            delta[i] = Integer.decode(tokens[i]);
        }
        return delta;
    }

    /**
     * Closes the learning pass: every bounder collected so far is trained with its
     * message id, after which the scorer is flagged as trained.
     */
    @Override
    public void endOfStream() throws AdeException, AdeFlowException {
        for (String messageId : m_lastSeen.keySet()) {
            m_lastSeen.get(messageId).train(messageId);
        }
        m_trained = true;
    }

    /**
     * Dumps the complete state for debugging: first whatever the superclass prints, then
     * the model of every message id in key order.
     * @param out output stream to print out object state.
     */
    @Override
    public void debugPrint(PrintStream out) throws AdeException {
        super.debugPrint(out);
        for (String messageId : m_lastSeen.keySet()) {
            final PerodicityBounder bounder = m_lastSeen.get(messageId);
            bounder.debugPrint(messageId, out);
        }
    }

}