edu.uga.cs.fluxbuster.features.FeatureCalculator.java Source code

Java tutorial

Introduction

Here is the source code for edu.uga.cs.fluxbuster.features.FeatureCalculator.java

Source

/*
* Copyright (C) 2012 Chris Neasbitt
* Author: Chris Neasbitt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package edu.uga.cs.fluxbuster.features;

import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.Formatter;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.joda.time.Days;

import edu.uga.cs.fluxbuster.db.DBInterface;
import edu.uga.cs.fluxbuster.db.DBInterfaceFactory;
import edu.uga.cs.fluxbuster.utils.PropertiesUtils;

/**
 * This class calculates longitudinal features of each cluster and
 * stores them in the database.
 * 
 * @author Chris Neasbitt
 */
public class FeatureCalculator {

    private DBInterface dbi = null;

    private Properties properties = null;

    private SimpleDateFormat df = null;

    private ArrayList<Date> prevDateBuf = null;
    private Date prevDateBufDate = null;
    private int prevDateBufWindow = 0;

    private static final String TABLES_QUERY1KEY = "TABLES_QUERY1";

    private static final String DOMAINSPREFIXKEY = "DOMAINS_TABLE_PREFIX";

    private static final String RESIPSPREFIXKEY = "RESIPS_TABLE_PREFIX";

    private static final String NOVELTY_QUERY1_1KEY = "NOVELTY_QUERY1_PART1";

    private static final String NOVELTY_QUERY1_2KEY = "NOVELTY_QUERY1_PART2";

    private static final String NOVELTY_QUERY1_3KEY = "NOVELTY_QUERY1_PART3";

    private static final String NOVELTY_QUERY2KEY = "NOVELTY_QUERY2";

    private static final String NOVELTY_WINDOWSKEY = "NOVELTY_WINDOWS";

    private static final String NOVELTY_WINFIELDSKEY = "NOVELTY_WINDOW_FIELDS";

    private static final String NOVELTY_QUERY3KEY = "NOVELTY_QUERY3";

    private static final String PREVCLUSTER_QUERY1KEY = "PREVCLUSTER_QUERY1";

    private static final String PREVCLUSTER_QUERY2KEY = "PREVCLUSTER_QUERY2";

    private static final String PREVCLUSTER_QUERY3KEY = "PREVCLUSTER_QUERY3";

    private static final String PREVCLUSTER_QUERY4KEY = "PREVCLUSTER_QUERY4";

    private static final String PREVCLUSTER_WINDOWKEY = "PREVCLUSTER_WINDOW";

    private static final String DOMAINSPERNETWORK_WINDOWKEY = "DOMAINSPERNETWORK_WINDOW";

    private static final String DOMAINSPERNETWORK_QUERY1KEY = "DOMAINSPERNETWORK_QUERY1";

    private static final String DOMAINSPERNETWORK_QUERY2KEY = "DOMAINSPERNETWORK_QUERY2";

    private static final String DOMAINSPERNETWORK_QUERY3KEY = "DOMAINSPERNETWORK_QUERY3";

    private static Log log = LogFactory.getLog(FeatureCalculator.class);

    /**
     * Instantiates a new feature calculator.
     *
     * @throws IOException if the FeatureCalculator.properties file
     *       can not be read
     */
    public FeatureCalculator() throws IOException {
        this(DBInterfaceFactory.loadDBInterface());
    }

    /**
     * Instantiates a new feature calculator with a specific database
     * interface.
     *
     * @param dbi the database interface
     * @throws IOException if the FeatureCalculator.properties file
     *       can not be read
     */
    public FeatureCalculator(DBInterface dbi) throws IOException {
        this.dbi = dbi;
        properties = PropertiesUtils.loadProperties(this.getClass());
        df = new SimpleDateFormat("yyyyMMdd");
    }

    /**
     * Calculates the domains per network feature for each cluster generated
     * on a specific run date.
     * 
     * @param log_date the run date
     * @param window the number of days previous to use in feature calculation
     * @return a table of values where the keys are cluster ids and the values 
     *       are the feature values
     * @throws SQLException if there is an error calculating the feature values
     */

    public Map<Integer, Double> calculateDomainsPerNetwork(Date log_date, int window) throws SQLException {
        HashMap<Integer, Double> retval = new HashMap<Integer, Double>();
        ArrayList<Date> prevDates = getPrevDates(log_date, window);
        if (prevDates.size() > 0) {
            String logDateStr = df.format(log_date);
            StringBuffer add_query = new StringBuffer();
            Formatter formatter = new Formatter(add_query);

            for (Date prevDate : prevDates) {
                String prevDateStr = df.format(prevDate);
                formatter.format(" " + properties.getProperty(DOMAINSPERNETWORK_QUERY1KEY) + " ", logDateStr,
                        prevDateStr, prevDateStr);
            }
            formatter.close();

            StringBuffer querybuf = new StringBuffer();
            formatter = new Formatter(querybuf);
            formatter.format(properties.getProperty(DOMAINSPERNETWORK_QUERY2KEY), logDateStr, logDateStr,
                    logDateStr, add_query.toString());
            ResultSet rs = null;
            try {
                rs = dbi.executeQueryWithResult(querybuf.toString());
                while (rs.next()) {
                    retval.put(rs.getInt(1), rs.getDouble(2));
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs != null && !rs.isClosed()) {
                    rs.close();
                }
                formatter.close();
            }
        }
        return retval;
    }

    /**
     * Calculates the cluster novelty feature for each cluster generated
     * on a specific run date.
     *
     * @param log_date the run date
     * @param window the number of days previous to use in feature calculation
     * @return a table of values where the keys are cluster ids and the values 
     *       are the feature values
     * @throws SQLException if there is an error calculating the feature values
     */
    public Map<Integer, Double> calculateNoveltyFeature(Date log_date, int window) throws SQLException {
        HashMap<Integer, Double> retval = new HashMap<Integer, Double>();
        ArrayList<Date> prevDates = getPrevDates(log_date, window);

        if (prevDates.size() > 0) {
            StringBuffer querybuf = new StringBuffer();
            Formatter formatter = new Formatter(querybuf);
            String curdatestr = df.format(log_date);
            formatter.format(properties.getProperty(NOVELTY_QUERY1_1KEY), curdatestr, curdatestr, curdatestr,
                    curdatestr);
            for (Date prevDate : prevDates) {
                formatter.format(" " + properties.getProperty(NOVELTY_QUERY1_2KEY) + " ", df.format(prevDate));
            }
            formatter.format(properties.getProperty(NOVELTY_QUERY1_3KEY), curdatestr, curdatestr);

            ResultSet rs2 = null;
            Hashtable<Integer, Hashtable<String, Long>> new_resolved_ips = new Hashtable<Integer, Hashtable<String, Long>>();
            try {
                rs2 = dbi.executeQueryWithResult(querybuf.toString());
                while (rs2.next()) {
                    int cluster_id = rs2.getInt(2);
                    if (!new_resolved_ips.containsKey(cluster_id)) {
                        new_resolved_ips.put(cluster_id, new Hashtable<String, Long>());
                    }
                    String secondLevelDomainName = rs2.getString(1);
                    long newips = rs2.getLong(3);
                    Hashtable<String, Long> clustertable = new_resolved_ips.get(cluster_id);
                    clustertable.put(secondLevelDomainName, newips);
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs2 != null && !rs2.isClosed()) {
                    rs2.close();
                }
                formatter.close();
            }

            Hashtable<String, List<Integer>> numDays = new Hashtable<String, List<Integer>>();
            for (Date prevDate : prevDates) {
                String prevDateStr = df.format(prevDate);
                querybuf = new StringBuffer();
                formatter = new Formatter(querybuf);
                formatter.format(properties.getProperty(NOVELTY_QUERY2KEY), curdatestr, prevDateStr, curdatestr,
                        prevDateStr);
                ResultSet rs3 = null;
                try {
                    rs3 = dbi.executeQueryWithResult(querybuf.toString());
                    while (rs3.next()) {
                        String sldn = rs3.getString(1);
                        if (!numDays.containsKey(sldn)) {
                            numDays.put(sldn, new ArrayList<Integer>());
                        }
                        Date pd = rs3.getDate(2);
                        DateTime start = new DateTime(pd.getTime());
                        DateTime end = new DateTime(log_date.getTime());
                        Days d = Days.daysBetween(start, end);
                        int diffDays = d.getDays();
                        numDays.get(sldn).add(diffDays);
                    }
                } catch (Exception e) {
                    if (log.isErrorEnabled()) {
                        log.error(e);
                    }
                } finally {
                    if (rs3 != null && !rs3.isClosed()) {
                        rs3.close();
                    }
                    formatter.close();
                }
            }

            Hashtable<Integer, List<Float>> clusterValues = new Hashtable<Integer, List<Float>>();
            for (int clusterID : new_resolved_ips.keySet()) {
                clusterValues.put(clusterID, new ArrayList<Float>());

                Hashtable<String, Long> sldnValues = new_resolved_ips.get(clusterID);
                for (String sldn : sldnValues.keySet()) {
                    if (numDays.keySet().contains(sldn)) {
                        long newIPCount = sldnValues.get(sldn);
                        float f = ((float) newIPCount) / Collections.max(numDays.get(sldn));
                        clusterValues.get(clusterID).add(f);

                    }
                }
            }

            for (int clusterID : clusterValues.keySet()) {
                if (clusterValues.get(clusterID) == null) { //I dont think it is possible for this to ever be true
                    retval.put(clusterID, null);
                } else {
                    double sum = 0;
                    for (double d : clusterValues.get(clusterID)) {
                        sum += d;
                    }
                    double val = 0;
                    if (clusterValues.get(clusterID).size() > 0) {
                        val = sum / clusterValues.get(clusterID).size();
                    }
                    retval.put(clusterID, val);
                }
            }
        }
        return retval;
    }

    /**
     * Calculates the previous cluster ratio feature for each cluster generated
     * on a specific run date and within the a specific window
     *
     * @param log_date the run date
     * @param window the number of days previous to use in feature calculation
     * @return a table of results, the keys of the table are cluster ids and the
     *       values are lists of two elements.  The first element is the 
     *       last_growth_ratio_prev_clusters value and the second element is the
     *       last_growth_prefix_ratio_prev_clusters value
     * @throws SQLException if there is and error calculating the feature
     */
    public Hashtable<Integer, List<Double>> calculatePrevClusterRatios(Date log_date, int window)
            throws SQLException {
        Hashtable<Integer, List<Double>> retval = new Hashtable<Integer, List<Double>>();

        ArrayList<Date> prevDates = getPrevDates(log_date, window);
        String query1 = properties.getProperty(PREVCLUSTER_QUERY1KEY);
        String query2 = properties.getProperty(PREVCLUSTER_QUERY2KEY);
        String logDateStr = df.format(log_date);
        String completequery = new String();

        StringBuffer addQueryBuff = new StringBuffer();
        for (int i = 0; i < prevDates.size(); i++) {
            String prevDateStr = df.format(prevDates.get(i));
            StringBuffer querybuf = new StringBuffer();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(query1, logDateStr, logDateStr, prevDateStr, prevDateStr, prevDateStr);
            addQueryBuff.append(querybuf.toString());
            if (i < prevDates.size() - 1) {
                addQueryBuff.append(" UNION ");
            }
            formatter.close();
        }

        if (addQueryBuff.length() > 0) {
            StringBuffer querybuf = new StringBuffer();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(query2, logDateStr, logDateStr, addQueryBuff.toString());
            completequery = querybuf.toString();
            formatter.close();
        }

        if (completequery.length() > 0) {
            ResultSet rs = null;
            try {
                rs = dbi.executeQueryWithResult(completequery);
                while (rs.next()) {
                    ArrayList<Double> temp = new ArrayList<Double>();
                    temp.add(rs.getDouble(3));
                    temp.add(rs.getDouble(4));
                    retval.put(rs.getInt(1), temp);
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs != null && !rs.isClosed()) {
                    rs.close();
                }
            }
            Hashtable<Integer, Double> queryPerDomain = getQueriesPerDomain(log_date);
            for (Integer clusterid : retval.keySet()) {
                List<Double> values = retval.get(clusterid);
                values.set(0, values.get(0) / queryPerDomain.get(clusterid));
                values.set(1, values.get(1) / queryPerDomain.get(clusterid));
            }
        }

        return retval;
    }

    /**
     * Gets run dates previous to a specific date within a window
     * of days from that date.
     *
     * @param log_date the run date
     * @param window the number of days previous to the current date
     * @return the list of previous run dates
     * @throws SQLException if there is an error retrieving the previous
     *       run dates
     */
    public ArrayList<Date> getPrevDates(Date log_date, int window) throws SQLException {
        ArrayList<Date> prevDates = new ArrayList<Date>();
        if (prevDateBufDate != null && prevDateBuf != null && prevDateBufDate.equals(log_date)
                && prevDateBufWindow >= window) {

            //pull the dates within the day window from the prevDateBuf cache
            Date pd = null;
            int windowcount = 0;
            for (Date d : prevDateBuf) {
                if (windowcount >= window) {
                    break;
                }
                if (pd == null) {
                    pd = d;
                    windowcount++;
                } else {
                    DateTime morerecent = new DateTime(d.getTime());
                    DateTime lessrecent = new DateTime(pd.getTime());
                    Days days = Days.daysBetween(morerecent, lessrecent);
                    windowcount += days.getDays();
                    pd = d;
                }
                prevDates.add(d);
            }

        } else {
            String domainsprefix = properties.getProperty(DOMAINSPREFIXKEY);
            String resipsprefix = properties.getProperty(RESIPSPREFIXKEY);

            ArrayList<String> tablenames = new ArrayList<String>();
            ResultSet rs1 = null;
            try {
                rs1 = dbi.executeQueryWithResult(properties.getProperty(TABLES_QUERY1KEY));
                while (rs1.next()) {
                    tablenames.add(rs1.getString(1));
                }
            } catch (Exception e) {
                if (log.isErrorEnabled()) {
                    log.error(e);
                }
            } finally {
                if (rs1 != null && !rs1.isClosed()) {
                    rs1.close();
                }
            }

            GregorianCalendar cal = new GregorianCalendar();
            cal.setTime(log_date);
            for (int i = 0; i < window; i++) {
                cal.roll(Calendar.DAY_OF_YEAR, false);
                Date temp = cal.getTime();
                String datestr = df.format(temp);
                if (tablenames.contains(domainsprefix + "_" + datestr)
                        && tablenames.contains(resipsprefix + "_" + datestr)) {
                    prevDates.add(temp);
                }
            }

            //cache the values for later
            if (prevDateBuf == null) {
                prevDateBuf = new ArrayList<Date>();
            } else {
                prevDateBuf.clear();
            }
            prevDateBuf.addAll(prevDates);
            prevDateBufDate = log_date;
            prevDateBufWindow = window;
        }
        return prevDates;
    }

    /**
     * Retrieves the number of dns queries per domain for each cluster
     * generated on a specific run date.
     *
     * @param log_date the run date
     * @return a table of values where the keys are cluster ids and the values 
     *       are the queries per domain value
     * @throws SQLException if there is an error retrieving the queries
     *       per domain values
     */
    private Hashtable<Integer, Double> getQueriesPerDomain(Date log_date) throws SQLException {
        Hashtable<Integer, Double> retval = new Hashtable<Integer, Double>();
        StringBuffer querybuf = new StringBuffer();
        Formatter formatter = new Formatter(querybuf);
        formatter.format(properties.getProperty(PREVCLUSTER_QUERY3KEY), df.format(log_date));
        ResultSet rs = null;
        try {
            rs = dbi.executeQueryWithResult(querybuf.toString());
            while (rs.next()) {
                retval.put(rs.getInt(1), rs.getDouble(2));
            }
        } catch (Exception e) {
            if (log.isErrorEnabled()) {
                log.error(e);
            }
        } finally {
            if (rs != null && !rs.isClosed()) {
                rs.close();
            }
            formatter.close();
        }
        return retval;
    }

    /**
     * Calculates the domains per network feature for each cluster generated
     * on a specific run date and stores them in the database.
     * 
     * @param log_date the run date
     * @throws Exception if there is an error calculating or storing the 
     *       feature values
     */
    public void updateDomainsPerNetwork(Date log_date) throws Exception {
        Map<Integer, Double> dpn = this.calculateDomainsPerNetwork(log_date,
                Integer.parseInt(properties.getProperty(DOMAINSPERNETWORK_WINDOWKEY)));
        for (int clusterid : dpn.keySet()) {
            StringBuffer querybuf = new StringBuffer();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(properties.getProperty(DOMAINSPERNETWORK_QUERY3KEY), df.format(log_date),
                    dpn.get(clusterid).toString(), String.valueOf(clusterid));
            dbi.executeQueryNoResult(querybuf.toString());
            formatter.close();
        }
    }

    /**
     * Updates each cluster's longitudinal features for all clusters
     * generated during a specific run date.
     *
     * @param log_date the run date
     * @throws Exception if unable to calculate or store the longitudinal
     *       feature values
     */
    public void updateFeatures(Date log_date) throws Exception {
        String simplename = null;
        if (log.isInfoEnabled()) {
            simplename = this.getClass().getSimpleName();
            log.info(simplename + " Started: " + Calendar.getInstance().getTime());
            log.info("Updating novelty features.");
        }
        dbi.initClusterTables(log_date);
        updateNoveltyFeature(log_date);
        if (log.isInfoEnabled()) {
            log.info("Novelty features updated.");
            log.info("Updating previous cluster ratio features.");
        }
        updatePrevClusterRatios(log_date);
        if (log.isInfoEnabled()) {
            log.info("Previous cluster ratio features updated.");
            log.info("Updating domains per network feature.");
        }
        updateDomainsPerNetwork(log_date);
        if (log.isInfoEnabled()) {
            log.info("Domains per network feature updated.");
            log.info(simplename + " Finished: " + Calendar.getInstance().getTime());
        }
    }

    /**
     * Calculates the cluster novelty feature for each cluster generated
     * on a specific run date and stores them in the database.
     *
     * @param log_date the run date
     * @throws Exception if there is an error calculating or storing the feature
     *       values
     */
    public void updateNoveltyFeature(Date log_date) throws Exception {
        Map<Integer, String> windowvals = new TreeMap<Integer, String>();
        String[] windowsstr = properties.getProperty(NOVELTY_WINDOWSKEY).split(",");
        String[] windowfields = properties.getProperty(NOVELTY_WINFIELDSKEY).split(",");

        if (windowfields.length != windowsstr.length) {
            throw new Exception("Number of novelty window values and fields do not match.");
        }

        for (int i = 0; i < windowsstr.length; i++) {
            windowvals.put(Integer.parseInt(windowsstr[i]), windowfields[i]);
        }

        //We start from largest window to smallest so we can cache the prevDates results for later use
        List<Integer> windowkeys = new ArrayList<Integer>(windowvals.keySet());
        Collections.reverse(windowkeys);

        for (int window : windowkeys) {
            Map<Integer, Double> novelty = calculateNoveltyFeature(log_date, window);
            for (int clusterid : novelty.keySet()) {
                StringBuffer querybuf = new StringBuffer();
                Formatter formatter = new Formatter(querybuf);
                formatter.format(properties.getProperty(NOVELTY_QUERY3KEY), df.format(log_date),
                        windowvals.get(window), String.valueOf(novelty.get(clusterid)), String.valueOf(clusterid),
                        df.format(log_date));
                dbi.executeQueryNoResult(querybuf.toString());
                formatter.close();
            }
        }

    }

    /**
     * Calculates the previous cluster ratio feature for each cluster generated
     * on a specific run date and stores them in the database.
     *
     * @param log_date the run date
     * @throws SQLException if the feature values can not be stored in the database
     */
    public void updatePrevClusterRatios(Date log_date) throws SQLException {
        Hashtable<Integer, List<Double>> ratios = this.calculatePrevClusterRatios(log_date,
                Integer.parseInt(properties.getProperty(PREVCLUSTER_WINDOWKEY)));
        for (int clusterid : ratios.keySet()) {
            List<Double> ratiovals = ratios.get(clusterid);
            StringBuffer querybuf = new StringBuffer();
            Formatter formatter = new Formatter(querybuf);
            formatter.format(properties.getProperty(PREVCLUSTER_QUERY4KEY), df.format(log_date),
                    ratiovals.get(0).toString(), ratiovals.get(1).toString(), Integer.toString(clusterid));
            dbi.executeQueryNoResult(querybuf.toString());
            formatter.close();
        }
    }
}