// Fluxbuster — hierarchical clustering driver (ClusterGenerator)
/* * Copyright (C) 2012 Chris Neasbitt * Author: Chris Neasbitt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.uga.cs.fluxbuster.clustering; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import java.util.Vector; import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.joda.time.DateTime; import edu.uga.cs.fluxbuster.classification.ClusterClass; import edu.uga.cs.fluxbuster.clustering.hierarchicalclustering.Dendrogram; import edu.uga.cs.fluxbuster.clustering.hierarchicalclustering.DistanceMatrix; import edu.uga.cs.fluxbuster.clustering.hierarchicalclustering.HCluster; import edu.uga.cs.fluxbuster.clustering.hierarchicalclustering.HierarchicalClustering; import 
edu.uga.cs.fluxbuster.clustering.hierarchicalclustering.HierarchicalClustering.LinkageType; import edu.uga.cs.fluxbuster.db.DBInterface; import edu.uga.cs.fluxbuster.db.DBInterfaceFactory; import edu.uga.cs.fluxbuster.utils.PropertiesUtils; import edu.uga.cs.fluxbuster.utils.DomainNameUtils; /** * This class initiates the hierarchical clustering process. * * @author Chris Neasbitt */ public class ClusterGenerator { private ArrayList<String> domainWhitelist = null; private Properties localprops = null, appprops = null; private static final String WHITELISTKEY = "WHITELIST_FILE"; private static final String GAMMAKEY = "GAMMA"; private static final String FLUXFILEREGEXKEY = "CANDIDATE_FLUX_FILE_REGEX"; private static final String FLUXFILEPARSEREGEXKEY = "CANDIDATE_FLUX_FILE_PARSING_REGEX"; private static final String FLUXDIRKEY = "CANDIDATE_FLUX_DIR"; private static final String MINRRSETSIZEKEY = "MIN_TOTAL_RRSET_SIZE"; private static final String MINDIVERSITYKEY = "MIN_TOTAL_DIVERSITY"; private static final String SHORTTTLKEY = "VERY_SHORT_TTL"; private static final String CANDIDATETHRESHKEY = "GOOD_CANDIDATE_THRESHOLD"; private static final String MAXDOMAINSKEY = "MAX_CANDIDATE_DOMAINS"; private static final String LINKAGETYPEKEY = "LINKAGE_TYPE"; private static final String MAXCUTHEIGHTKEY = "MAX_CUT_HEIGHT"; private static final String DISTMATRIXKEY = "DIST_MATRIX_MULTITHREADED"; private static final String DISTNUMTHREADSKEY = "DIST_MATRIX_NUMTHREADS"; private static final String SELECTEDCFDFILEKEY = "SELECTED_CFD_FILE"; private static Log log = LogFactory.getLog(ClusterGenerator.class); /** * Instantiates a new cluster generator. 
* * @throws IOException if the ClusterGenerator.localprops file can * not be read */ public ClusterGenerator() throws IOException { localprops = PropertiesUtils.loadProperties(this.getClass()); appprops = PropertiesUtils.loadAppWideProperties(); try { loadWhitelist(); } catch (IOException e) { if (log.isErrorEnabled()) { log.error("Error loading domain whitelist.", e); } } } /** * Load the domain whitelist. * * @throws IOException if the whitelist file can not be read */ private void loadWhitelist() throws IOException { domainWhitelist = new ArrayList<String>(); String whitelistfile = localprops.getProperty(WHITELISTKEY); BufferedReader br = new BufferedReader(new FileReader(whitelistfile)); String line; while ((line = br.readLine()) != null) { domainWhitelist.add(DomainNameUtils.stripDots(line.trim())); } br.close(); } /** * Compute a distance matrix from a list of candidate flux domains. * * @param cfds the candidate flux domains * @return the vector of values in the distance matrix in row major * order */ private Vector<Float> computeDistanceMatrix(List<CandidateFluxDomain> cfds) { boolean multithread = Boolean.parseBoolean(appprops.getProperty(DISTMATRIXKEY)); if (multithread) { int numthreads = Integer.parseInt(appprops.getProperty(DISTNUMTHREADSKEY)); if (numthreads < 1) { numthreads = 1; } return computeDistanceMatrixMultiThreaded(cfds, numthreads); } else { return computeDistanceMatrixMultiThreaded(cfds, 1); } } /** * Compute a distance matrix from a list of candidate flux domains with * a maximum number of calculation threads. 
* * @param cfds the list of candidate flux domains * @param maxnumthreads the thread ceiling * @return the vector of values in the distance matrix in row major * order */ private Vector<Float> computeDistanceMatrixMultiThreaded(List<CandidateFluxDomain> cfds, int maxnumthreads) { Vector<Float> retval = new Vector<Float>(); ThreadFactory tf = Executors.defaultThreadFactory(); double gamma = Double.parseDouble(localprops.getProperty(GAMMAKEY)); ArrayList<Thread> threads = new ArrayList<Thread>(); ArrayList<HashSet<Integer>> threadrows = new ArrayList<HashSet<Integer>>(); int interval = (int) Math.ceil((cfds.size() - 1) / (double) maxnumthreads); int left = 0; int right = cfds.size() - 2; HashSet<Integer> curset = null; boolean addLeftFirst = true; while (left <= right) { if (curset == null) { curset = new HashSet<Integer>(); } if (curset.size() == interval) { threadrows.add(curset); curset = null; } else { if (addLeftFirst) { curset.add(left++); } else { curset.add(right--); } addLeftFirst = !addLeftFirst; if (curset.size() == interval) { continue; } if (addLeftFirst) { curset.add(left++); } else { curset.add(right--); } } } if (curset != null && curset.size() > 0) { threadrows.add(curset); } ArrayList<Vector<Float>> resultsList = new ArrayList<Vector<Float>>(cfds.size()); // this is necessary to make sure that the proper indexes exist in // resultsList before being accessed by the threads for (int i = 0; i < cfds.size() - 1; i++) { resultsList.add(null); } for (int i = 0; i < threadrows.size(); i++) { Thread t = tf.newThread(new DistanceMatrixCalculator(gamma, threadrows.get(i), cfds, resultsList)); threads.add(t); } for (Thread t : threads) { t.start(); } for (Thread t : threads) { try { t.join(); } catch (InterruptedException e) { e.printStackTrace(); } } for (int i = 0; i < resultsList.size(); i++) { retval.addAll(resultsList.get(i)); } return retval; } /** * Determines if a domain name is in the whitelist. 
* * @param domainname the domain name * @return true, if the domain name is on the whitelist */ private boolean isWhiteListable(String domainname) { for (String d : domainWhitelist) { if (domainname.endsWith(d)) { return true; } } return false; } // TODO improve the candidate score algorithm /** * Calculates the candidate flux domains clustering potential. This * value is used to sort which candidate flux domains are the best * candidates for clustering. * * @param cfd the candidate flux domain * @return the candidate score */ public double calcCandidateScore(CandidateFluxDomain cfd) { int minTotalRrsetSize = Integer.parseInt(localprops.getProperty(MINRRSETSIZEKEY)); double minTotalDiversity = Double.parseDouble(localprops.getProperty(MINDIVERSITYKEY)); double veryShortTTL = Double.parseDouble(localprops.getProperty(SHORTTTLKEY)); double ipDiv = IPDiversityCalculator.ipDiversity(IPDiversityCalculator.getV4Ips(cfd.getIps())); if (cfd.getNumIPs() >= minTotalRrsetSize && ipDiv > minTotalDiversity) { return 1.0; } else if (cfd.getNumIPs() == 1 && cfd.getAvgTTL() <= veryShortTTL) { return 1.0; } return 0.0; } /** * Load candidate flux domains from the data files for the time period * between the start and end times. * * @param startTime the start time in sec. * @param endTime the end time in sec. * @param domainfile a file containing the list of domains that should * be clustered regardless of the candidate score. If null the list * is ignored. 
* @return the list of candidate flux domains * @throws Exception if there is an error reading the ClusterGenerator.localprops * or data files */ public List<CandidateFluxDomain> loadCandidateFluxDomains(long startTime, long endTime, String domainfile) throws Exception { ArrayList<CandidateFluxDomain> retval = new ArrayList<CandidateFluxDomain>(); HashMap<String, CandidateFluxDomain> seenDomains = new HashMap<String, CandidateFluxDomain>(); Set<String> recentFluxDomains = loadRecentFluxDomains(startTime); String dirPath = appprops.getProperty(FLUXDIRKEY); double goodCandidateThreshold = Double.parseDouble(appprops.getProperty(CANDIDATETHRESHKEY)); int maxCandidateDomains = Integer.parseInt(appprops.getProperty(MAXDOMAINSKEY)); for (String filename : getFileNames(dirPath, startTime, endTime)) { BufferedReader br = null; try { GZIPInputStream gis = new GZIPInputStream(new FileInputStream(filename)); br = new BufferedReader(new InputStreamReader(gis)); String line; while ((line = br.readLine()) != null) { CandidateFluxDomain cfd = CandidateFluxDomain.parseFromLog(line); if (isWhiteListable(cfd.getDomainName())) { if (log.isDebugEnabled()) { log.debug(cfd.getDomainName() + " is whitelisted."); } continue; } String domainname = cfd.getDomainName(); if (seenDomains.containsKey(domainname)) { CandidateFluxDomain prev = seenDomains.get(domainname); seenDomains.put(domainname, prev.merge(cfd)); } else { seenDomains.put(domainname, cfd); } } } catch (Exception e) { e.printStackTrace(); } finally { if (br != null) { br.close(); } } } //add all domains from a file if (domainfile != null) { addDomainsFromFile(domainfile, maxCandidateDomains, retval, seenDomains); } ArrayList<String> allDomains = new ArrayList<String>(); allDomains.addAll(seenDomains.keySet()); // add all domains from recently seen flux domains if (retval.size() < maxCandidateDomains && recentFluxDomains.size() > 0) { addRecentFluxDomains(recentFluxDomains, maxCandidateDomains, retval, seenDomains, allDomains); } 
// then add the non-recent ones that meet the score threshold if (retval.size() < maxCandidateDomains) { addThresholdMeetingDomains(maxCandidateDomains, goodCandidateThreshold, retval, seenDomains, allDomains); } // then fill the rest randomly from what's left over if (retval.size() < maxCandidateDomains) { Collections.shuffle(allDomains); for (String domainname : allDomains) { if (retval.size() == maxCandidateDomains) { break; } retval.add(seenDomains.get(domainname)); } } return retval; } /** * Copies candidate flux domains into a list if its candidate score is greater * than a threshold up to a limit on the size of the list. The candidate flux * domains are copied from a map of candidate flux domains. Domains are only * considered if they appear in the all domains list. Once a candidate flux * domain is copied it's corresponding domain name is removed from the all * domains list. * * @param maxCandidateDomains the limit on the total number of domains to add * @param goodCandidateThreshold the candidate score threshold * @param resultBuf the list in which to store the candidate flux domains * @param seenDomains the map of candidate flux domains. 
* @param allDomains this list of domains to consider */ private void addThresholdMeetingDomains(int maxCandidateDomains, double goodCandidateThreshold, List<CandidateFluxDomain> resultBuf, HashMap<String, CandidateFluxDomain> seenDomains, ArrayList<String> allDomains) { ArrayList<CandidateFluxDomain> sortedDomains = new ArrayList<CandidateFluxDomain>(); ArrayList<String> removeDomains = new ArrayList<String>(); // get all cfd's whose score is over the threshold for (String domain : allDomains) { CandidateFluxDomain temp = seenDomains.get(domain); if (this.calcCandidateScore(temp) > goodCandidateThreshold) { sortedDomains.add(temp); } } // sort them in descending order by score Collections.sort(sortedDomains, new Comparator<CandidateFluxDomain>() { @Override public int compare(CandidateFluxDomain o1, CandidateFluxDomain o2) { Double o1score = calcCandidateScore(o1); Double o2score = calcCandidateScore(o2); return o2score.compareTo(o1score); // Descending // order } }); for (CandidateFluxDomain cfd2 : sortedDomains) { if (resultBuf.size() == maxCandidateDomains) { break; } resultBuf.add(cfd2); removeDomains.add(cfd2.getDomainName()); } allDomains.removeAll(removeDomains); } /** * Copies candidate flux domains into a list if its corresponding 2LD is present * in a list of recent flux domains up to a limit on the size of the list. The * candidate flux domains are copied from a map of candidate flux domains. Domains * are only considered if they appear in the all domains list. Once a candidate flux * domain is copied it's corresponding domain name is removed from the all domains list. * * @param recentFluxDomains the list of recent flux 2LD's * @param maxCandidateDomains the limit on the total number of domains to add * @param resultBuf the list in which to store the candidate flux domains * @param seenDomains the map of candidate flux domains. 
* @param allDomains this list of domains to consider */ private void addRecentFluxDomains(Set<String> recentFluxDomains, int maxCandidateDomains, List<CandidateFluxDomain> resultBuf, HashMap<String, CandidateFluxDomain> seenDomains, ArrayList<String> allDomains) { ArrayList<String> removeDomains = new ArrayList<String>(); Collections.shuffle(allDomains); // this is probably not necessary for (String domainname : allDomains) { if (resultBuf.size() == maxCandidateDomains) { break; } String domainname2LD = DomainNameUtils.extractEffective2LD(domainname); if (domainname2LD != null && recentFluxDomains.contains(domainname2LD)) { resultBuf.add(seenDomains.get(domainname)); removeDomains.add(domainname); } } allDomains.removeAll(removeDomains); } /** * Copies candidate flux domains into a list if they appear in a domain file up * to a limit on the size of the list. The candidate flux domains are copied * from a map of candidate flux domains. Once a candidate flux domain is copied * it is removed from the map. * * @param domainfile the file from which to read the domains * @param maxCandidateDomains the limit on the total number of domains to add * @param resultBuf the list in which to store the candidate flux domains * @param seenDomains the map of candidate flux domains. 
* @throws IOException */ private void addDomainsFromFile(String domainfile, int maxCandidateDomains, List<CandidateFluxDomain> resultBuf, HashMap<String, CandidateFluxDomain> seenDomains) throws IOException { BufferedReader br = new BufferedReader(new FileReader(new File(domainfile))); String line = null; while ((line = br.readLine()) != null) { if (resultBuf.size() == maxCandidateDomains) { break; } line = DomainNameUtils.stripDots(line.trim()); CandidateFluxDomain d = seenDomains.get(line); if (d != null) { if (log.isDebugEnabled()) { log.debug("Adding domain " + line + " from domains file."); } resultBuf.add(d); seenDomains.remove(line); } else { if (log.isDebugEnabled()) { log.debug("Unable to load domain " + line + " from domains file."); } } } br.close(); } /** * Load recent flux 2LD's. * * @param startTime unix epoch in sec. * @return the list of recent flux 2LD's */ private Set<String> loadRecentFluxDomains(long startTime) { Set<String> retval = new HashSet<String>(); DBInterface iface = DBInterfaceFactory.loadDBInterface(); DateTime startDateTime = new DateTime(startTime * 1000); for (int i = 1; i < 8; i++) { Date prevdate = new Date(startDateTime.minusDays(i).getMillis()); try { for (StoredDomainCluster fluxCluster : iface.getClusters(prevdate, ClusterClass.FLUX)) { for (String domain : fluxCluster.getDomains()) { retval.add(DomainNameUtils.extractEffective2LD(domain)); } } } catch (Exception e) { if (log.isErrorEnabled()) { SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd"); log.error("Uable to load previous flux domains for " + dateFormat.format(prevdate), e); } } } return retval; } /** * Gets the names of the data input files from a specific * directory for the time period between the start and end times. 
* * @param dirPath the data file directory * @param startTime the start time * @param endTime the end time * @return the list of input file names */ private List<String> getFileNames(String dirPath, long startTime, long endTime) { ArrayList<String> retval = new ArrayList<String>(); ArrayList<File> selectedFiles = new ArrayList<File>(); String fileregex = localprops.getProperty(FLUXFILEREGEXKEY); String parseregx = localprops.getProperty(FLUXFILEPARSEREGEXKEY); Pattern parsepattern = Pattern.compile(parseregx); File fluxdir = new File(dirPath); if (fluxdir.isDirectory()) { File[] posFluxFiles = fluxdir.listFiles(); if (posFluxFiles != null) { for (File posFluxFile : posFluxFiles) { if (posFluxFile.getName().matches(fileregex)) { Matcher parsematcher = parsepattern.matcher(posFluxFile.getName()); parsematcher.find(); long timestamp = Long.parseLong(parsematcher.group(0)); if (timestamp >= startTime && timestamp < endTime) { selectedFiles.add(posFluxFile); } } } } } // sorts in ascending order by filename Collections.sort(selectedFiles, new Comparator<File>() { @Override public int compare(File o1, File o2) { return o1.getName().compareTo(o2.getName()); } }); for (File selectedFile : selectedFiles) { if (log.isDebugEnabled()) { log.debug("Loading file: " + selectedFile.getName()); } retval.add(selectedFile.getAbsolutePath()); } return retval; } /** * Runs the clustering process on the data files for the time period * between the start and end times. 
The linkage type and max cut height * are read from the ClusterGenerator.localprops file * * @param startTime the start time * @param endTime the end time * @param selcfds if true then the file with a list of domains to cluster regardless * of candidate score is used for clustering * @return the list of clusters * @throws Exception if there is an error reading the ClusterGenerator.localprops * or data files */ public List<DomainCluster> generateClusters(long startTime, long endTime, boolean selcfds) throws Exception { if (selcfds) { String selcfdfilepath = appprops.getProperty(SELECTEDCFDFILEKEY); if (new File(selcfdfilepath).exists()) { return this.generateClusters(startTime, endTime, selcfdfilepath); } } return this.generateClusters(startTime, endTime, null); } /** * Runs the clustering process on the data files for the time period * between the start and end times. The linkage type and max cut height * are read from the ClusterGenerator.localprops file * * @param startTime the start time * @param endTime the end time * @param domainfile a list of domains to cluster regardless of candidate * score * @return the list of clusters * @throws Exception if there is an error reading the ClusterGenerator.localprops * or data files */ public List<DomainCluster> generateClusters(long startTime, long endTime, String domainfile) throws Exception { double maxCutHeight = Double.parseDouble(appprops.getProperty(MAXCUTHEIGHTKEY)); String linkageTypeStr = appprops.getProperty(LINKAGETYPEKEY); LinkageType linkage = LinkageType.COMPLETE_LINKAGE; if (linkageTypeStr.toLowerCase().trim().equals("single")) { linkage = LinkageType.SINGLE_LINKAGE; } return this.generateClusters(startTime, endTime, domainfile, linkage, maxCutHeight); } /** * Runs the clustering process on the data files for the time period * between the start and end times. 
* * @param startTime the start time * @param endTime the end time * @param domainfile a list of domains to cluster regardless of candidate * score * @param linkage the linkage type * @param maxCutHeight the max cut height * @return the list of clusters * @throws Exception if there is an error reading the ClusterGenerator.localprops * or data files */ private List<DomainCluster> generateClusters(long startTime, long endTime, String domainfile, LinkageType linkage, double maxCutHeight) throws Exception { ArrayList<DomainCluster> retval = new ArrayList<DomainCluster>(); if (log.isInfoEnabled()) { log.info(this.getClass().getSimpleName() + " Started: " + Calendar.getInstance().getTime()); log.info("Loading Candidate Flux Domains."); } List<CandidateFluxDomain> cfdList = loadCandidateFluxDomains(startTime, endTime, domainfile); if (log.isInfoEnabled()) { log.info("Loaded " + cfdList.size() + " Candidate Flux Domains."); } if (cfdList.size() > 0) { if (log.isInfoEnabled()) { log.info("Computing Distance Matrix."); } Vector<Float> utDistValues = this.computeDistanceMatrix(cfdList); DistanceMatrix distMatrix = new DistanceMatrix(utDistValues); if (log.isInfoEnabled()) { log.info("Distance Matrix Calculated."); } HierarchicalClustering hc = new HierarchicalClustering(linkage); if (log.isInfoEnabled()) { log.info("Running Clusterer."); } hc.runClusterer(distMatrix, maxCutHeight); if (log.isInfoEnabled()) { log.info("Clustering Completed."); } Dendrogram dgram = hc.getDendrogram(); if (log.isInfoEnabled()) { log.info("Creating Domain Clusters."); } Vector<HCluster> hclusters = dgram.getClusters(maxCutHeight); for (HCluster hcluster : hclusters) { DomainCluster dm = new DomainCluster(); for (int index : hcluster.getIndexes()) { dm.addCandidateFluxDomain(cfdList.get(index)); } retval.add(dm); } if (log.isInfoEnabled()) { for (DomainCluster d : retval) { log.info(d.toString()); } log.info("Created " + retval.size() + " Domain Clusters."); log.info(this.getClass().getSimpleName() 
+ " Finished: " + Calendar.getInstance().getTime()); } } return retval; } /** * Store clusters through a db interface loaded by the DBInterfaceFactory. * * @param clusters the list of clusters to store. * @param log_date the clustering run date * @throws Exception if the database interface could not be loaded. */ public void storeClusters(List<DomainCluster> clusters, Date log_date) throws Exception { DBInterface dbiface = DBInterfaceFactory.loadDBInterface(); if (dbiface == null) { throw new Exception("Could not load DB interface."); } if (log.isInfoEnabled()) { log.info(this.getClass().getSimpleName() + " Started: " + Calendar.getInstance().getTime()); log.info("Storing " + clusters.size() + " Clusters."); } dbiface.initClusterTables(log_date); dbiface.storeClusters(clusters, "SIE", log_date); if (log.isInfoEnabled()) { log.info("Clusters stored."); log.info(this.getClass().getSimpleName() + " Finished: " + Calendar.getInstance().getTime()); } } /** * Prints each each cluster in the list to stdout. * * @param clusters the list clusters to print */ public void printClusters(List<DomainCluster> clusters) { for (DomainCluster cluster : clusters) { System.out.println(cluster); } } }