logfilegenerator.loglineanalyser.modules.Cluster.java Source code

Introduction

Here is the source code for logfilegenerator.loglineanalyser.modules.Cluster.java
Source

/*
*LogFileGenerator (LFG) allows generating Network Event Sequence (NES) data of any length based on a small piece of real log data.
*
*Copyright (C) 2016 Markus Wurzenberger
*
*This program is free software; you can redistribute it and/or modify
*it under the terms of the GNU General Public License as published by
*the Free Software Foundation; either version 3 of the License, or
*(at your option) any later version.
*
*This program is distributed in the hope that it will be useful,
*but WITHOUT ANY WARRANTY; without even the implied warranty of
*MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*GNU General Public License for more details.
*
*You should have received a copy of the GNU General Public License
*along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package logfilegenerator.loglineanalyser.modules;

import java.util.ArrayList;

import org.apache.commons.lang3.StringUtils;

import logfilegenerator.loglineanalyser.manager.LogLineManager;

/**
 * Stores a single cluster: its id, its textual description, the regular expression
 * generated from that description, and the data attached to the cluster.
 *
 * <p>Inside a description the character sequence {@code <*} is the wildcard symbol
 * (the original comments refer to it as the SLCT wildcard symbol).
 */

public class Cluster {

    /** Wildcard symbol as it appears in a cluster description. */
    private static final String WILDCARD = "<*";

    /** Regular expression that matches the literal wildcard symbol, used for splitting. */
    private static final String WILDCARD_SPLIT_REGEX = "<\\*";

    /**
     * Matches every character that has to be escaped so that description text is taken
     * literally: the backslash itself and {@code [ ] ( ) + | $ ^ { } . * ?}.
     */
    private static final String META_CHARACTERS_REGEX =
            "([\\\\\\[\\]\\(\\)\\+\\|\\$\\^\\{\\}\\.\\*\\?])";

    /** Identifier handed to the constructor. */
    private int clusterID;
    /** Cluster description; {@code <*} marks a wildcard. */
    private String descr;
    /** Regular expression generated from {@link #descr} at construction time. */
    private String regex;
    /** Number of space-separated tokens of the description that are not exactly {@code <*}. */
    private int clusterValue;
    /** Created empty by the constructor; presumably filled with the log lines of this cluster. */
    private LogLineManager logLines;
    /** Created empty by the constructor; how "most accurate" is decided is not visible in this class. */
    private LogLineManager mostAccurateLogLines;
    /** Created empty by the constructor; the meaning of the stored values is defined by the callers. */
    private ArrayList<Long> timeStampDistribution;
    /** Not initialised by the constructor: {@code null} until {@link #setWildcardContent} is called. */
    private String[][] wildcardContent;
    /** Number of occurrences of {@code <*} in the description. */
    private int numberOfWildcards;

    /**
     * Creates a cluster and derives regex, cluster value and wildcard count from the description.
     *
     * @param clusterCount id of the new cluster
     * @param cluster      cluster description; must not be {@code null}
     *                     (a {@code null} description fails with a {@link NullPointerException})
     */
    public Cluster(int clusterCount, String cluster) {
        // Fields are assigned directly instead of through the public setters, so that a
        // subclass overriding a setter cannot observe a half-constructed object.
        this.clusterID = clusterCount;
        this.descr = cluster;
        this.regex = generateRegexFromDescription(cluster);
        this.clusterValue = calculateClusterValue(cluster);
        this.logLines = new LogLineManager();
        this.mostAccurateLogLines = new LogLineManager();
        this.timeStampDistribution = new ArrayList<Long>();
        this.numberOfWildcards = StringUtils.countMatches(cluster, WILDCARD);
    }

    /**
     * Calculates the cluster value, i.e. how many words define the cluster.
     *
     * <p>The description is split at every single space; every resulting token that is not
     * exactly the wildcard symbol counts as one word. The result is also printed to stdout.
     *
     * @param cluster cluster description
     * @return number of non-wildcard tokens
     */
    private static int calculateClusterValue(String cluster) {
        int wordCount = 0;
        for (String token : cluster.split(" ")) {
            if (!WILDCARD.equals(token)) {
                wordCount++;
            }
        }
        System.out.println("clusterValue: " + wordCount + "\n");
        return wordCount;
    }

    /**
     * Generates a regular expression from the cluster description.
     *
     * <p>The literal parts of the description are escaped, every wildcard symbol is replaced
     * by {@code \S+} or {@code \S*} (see {@link #wildcardRegex}), one trailing space is removed
     * and the expression is closed with {@code .*} if the trimmed description ends with a
     * wildcard, otherwise with {@code $.*}. The result is also printed to stdout.
     *
     * <p>NOTE (from the original author): the expression deliberately does not start with a
     * reluctant "any characters" prefix, because it would then also match lines in which the
     * significant words are found at different positions; SLCT cares about word positions.
     *
     * @param cluster cluster description
     * @return regular expression for log lines belonging to the cluster
     */
    private static String generateRegexFromDescription(String cluster) {
        // Limit -1 keeps trailing empty pieces. Without it a wildcard at the very end of the
        // description disappears and a description consisting only of "<*" yields no pieces.
        String[] pieces = cluster.split(WILDCARD_SPLIT_REGEX, -1);
        StringBuilder regex = new StringBuilder();

        for (int i = 0; i < pieces.length; i++) {
            regex.append(escapeMetaCharacters(pieces[i]));
            // there is exactly one wildcard between two neighbouring pieces
            if (i + 1 < pieces.length) {
                regex.append(wildcardRegex(pieces, i));
            }
        }

        // A description normally ends with a space; drop it so that "\S+.*" is produced
        // instead of "\S+ .*". Only a space is removed, never a significant character,
        // and nothing is removed from an empty expression.
        int lastIndex = regex.length() - 1;
        if (lastIndex >= 0 && regex.charAt(lastIndex) == ' ') {
            regex.deleteCharAt(lastIndex);
        }

        if (cluster.trim().endsWith(WILDCARD)) {
            regex.append(".*");
        } else {
            // If the description does not end with a wildcard, the $ symbol makes sure that the
            // last symbol of a matching log line equals the last symbol of the regular expression.
            regex.append("$.*");
        }

        System.out.println("generated regex: " + regex.toString());

        return regex.toString();
    }

    /**
     * Escapes all regular expression meta characters (including the backslash) in one pass,
     * so that the given description text is matched literally.
     */
    private static String escapeMetaCharacters(String text) {
        return text.replaceAll(META_CHARACTERS_REGEX, "\\\\$1");
    }

    /**
     * Chooses the replacement for the wildcard located between {@code pieces[index]} and
     * {@code pieces[index + 1]}.
     *
     * <p>A wildcard that stands for a whole word (delimited by a space or by the start/end of
     * the description on both sides) becomes {@code \S+}: a space is not a word, and
     * {@code \S*} would let the following "empty" string count as a word. A wildcard that is
     * attached to other characters (e.g. after refinement) may also stand for the empty string
     * and becomes {@code \S*}.
     */
    private static String wildcardRegex(String[] pieces, int index) {
        String before = pieces[index];
        String after = pieces[index + 1];

        // an empty piece is a word boundary only at the very start / very end of the description
        boolean boundaryBefore = before.isEmpty() ? index == 0 : before.endsWith(" ");
        boolean boundaryAfter = after.isEmpty() ? index + 2 == pieces.length : after.startsWith(" ");

        return (boundaryBefore && boundaryAfter) ? "\\S+" : "\\S*";
    }

    /** @return the cluster id */
    public int getClusterID() {
        return clusterID;
    }

    /** @param clusterID the new cluster id */
    public void setClusterID(int clusterID) {
        this.clusterID = clusterID;
    }

    /** @return the cluster description */
    public String getDescr() {
        return descr;
    }

    /**
     * Replaces the description only; regex, cluster value and wildcard count are not recalculated.
     *
     * @param descr the new cluster description
     */
    public void setDescr(String descr) {
        this.descr = descr;
    }

    /** @return the regular expression of this cluster */
    public String getRegex() {
        return regex;
    }

    /** @param regex the new regular expression */
    public void setRegex(String regex) {
        this.regex = regex;
    }

    /** @return the cluster value */
    public int getClusterValue() {
        return clusterValue;
    }

    /** @param clusterValue the new cluster value */
    public void setClusterValue(int clusterValue) {
        this.clusterValue = clusterValue;
    }

    /** @return the log line manager of this cluster */
    public LogLineManager getLogLines() {
        return logLines;
    }

    /** @param logLines the new log line manager */
    public void setLogLines(LogLineManager logLines) {
        this.logLines = logLines;
    }

    /** @return the manager holding the most accurate log lines */
    public LogLineManager getMostAccurateLogLines() {
        return mostAccurateLogLines;
    }

    /** @param mostAccurateLogLines the new manager for the most accurate log lines */
    public void setMostAccurateLogLines(LogLineManager mostAccurateLogLines) {
        this.mostAccurateLogLines = mostAccurateLogLines;
    }

    /** @return the time stamp distribution list (the internal list, not a copy) */
    public ArrayList<Long> getTimeStampDistribution() {
        return timeStampDistribution;
    }

    /** @param timeStampDistribution the new time stamp distribution list */
    public void setTimeStampDistribution(ArrayList<Long> timeStampDistribution) {
        this.timeStampDistribution = timeStampDistribution;
    }

    /** @return the wildcard content, or {@code null} if it has not been set yet */
    public String[][] getWildcardContent() {
        return wildcardContent;
    }

    /** @param wildcardContent the new wildcard content */
    public void setWildcardContent(String[][] wildcardContent) {
        this.wildcardContent = wildcardContent;
    }

    /** @return the number of wildcards */
    public int getNumberOfWildcards() {
        return numberOfWildcards;
    }

    /** @param numberOfWildcards the new number of wildcards */
    public void setNumberOfWildcards(int numberOfWildcards) {
        this.numberOfWildcards = numberOfWildcards;
    }

}