Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package DatasetCreation;

import FeatureExtraction.IFeatureExtractor;
import FeatureRepresentation.FeatureRepresentor.FeatureRepresentation;
import Framework.DBFramework.Classification;
import IO.Console;
import IO.FileWriter;
import Math.MathCalc;
import java.util.ArrayList;
import java.util.Map;
import javafx.util.Pair;
import org.apache.commons.lang3.StringUtils;

/**
 * Builds CSV text for feature datasets: data rows, header row, "top X"
 * projections of an existing CSV dataset and feature-frequency reports.
 *
 * @param <T> the type of the elements the features are extracted from
 * @author Aviad
 */
public class DatasetCSVBuilder<T> {

    /**
     * Return CSV string which represents the dataset, one row per element.
     * Elements for which {@link #GetFeaturesVectorCSV} returns {@code null}
     * are skipped. Rows are separated by "\n" and there is no trailing
     * newline; when no row is produced the returned builder is empty.
     *
     * @param elements ArrayList of elements to build dataset from
     * @param featureExtractor a Feature Extractor object
     * @param selectedFeatures the top selected features to build the dataset
     * with
     * @param totalElementsNum forwarded to {@code MathCalc.GetTFIDF}
     * (presumably the total number of elements in the corpus - confirm
     * against MathCalc)
     * @param featureRepresentation how each feature cell is represented
     * @param classification the classification of all the records in the
     * dataset
     * @param addElementIDColumn add prefix column identifying the record
     * @param addClassificationColumn add suffix column identifying the class of
     * the record
     * @return CSV string which represents the dataset
     */
    public StringBuilder BuildDatabaseCSV(ArrayList<T> elements, IFeatureExtractor<T> featureExtractor,
            ArrayList<Pair<String, Integer>> selectedFeatures, int totalElementsNum,
            FeatureRepresentation featureRepresentation, Classification classification,
            boolean addElementIDColumn, boolean addClassificationColumn) {
        StringBuilder datasetCSV = new StringBuilder();
        for (T element : elements) {
            StringBuilder row = GetFeaturesVectorCSV(element, featureExtractor, selectedFeatures,
                    totalElementsNum, featureRepresentation, classification, addElementIDColumn,
                    addClassificationColumn);
            if (row != null) {
                datasetCSV.append(row).append("\n");
            }
        }
        // Every appended row is followed by "\n", so a non-empty builder always
        // ends with one. Guarding on length avoids deleteCharAt(-1) when no row
        // was produced (empty input, or every element had no features).
        if (datasetCSV.length() > 0) {
            datasetCSV.setLength(datasetCSV.length() - 1);
        }
        return datasetCSV;
    }

    /**
     * Return CSV string which represents the element features vector.
     * <p>
     * For {@code Binary} each cell is 1 when the element contains the selected
     * feature and 0 otherwise. For {@code TFIDF} each cell is the result of
     * {@code MathCalc.GetTFIDF} passed through {@code MathCalc.Round(value, 3)}.
     * Any other representation yields empty cells.
     *
     * @param element the element to extract the features from
     * @param featureExtractor a Feature Extractor object
     * @param selectedFeatures the top selected features to build the dataset
     * with; each pair's value is used as the number of elements containing
     * the feature
     * @param totalElementsNum forwarded to {@code MathCalc.GetTFIDF}
     * (presumably the total number of elements in the corpus - confirm
     * against MathCalc)
     * @param featureRepresentation how each feature cell is represented
     * @param classification the classification of given element dataset
     * @param addElementIDColumn add prefix column identifying the record
     * @param addClassificationColumn add suffix column identifying the class of
     * the record
     * @return CSV string which represents the element features vector, or
     * {@code null} when no features were extracted from the element
     */
    public StringBuilder GetFeaturesVectorCSV(T element, IFeatureExtractor<T> featureExtractor,
            ArrayList<Pair<String, Integer>> selectedFeatures, int totalElementsNum,
            FeatureRepresentation featureRepresentation, Classification classification,
            boolean addElementIDColumn, boolean addClassificationColumn) {
        Map<String, Integer> elementFeaturesFrequencies =
                featureExtractor.ExtractFeaturesFrequencyFromSingleElement(element);
        if (elementFeaturesFrequencies.isEmpty()) {
            // Callers rely on null to mean "skip this element".
            return null;
        }

        StringBuilder featuresVectorCSV = new StringBuilder();
        if (addElementIDColumn) {
            featuresVectorCSV.append(element.toString()).append(",");
        }

        int mostCommonFeatureFrequencyInElement = GetMostCommonSelectedFeatureFrequencyInElement(
                elementFeaturesFrequencies, selectedFeatures);

        for (Pair<String, Integer> selectedFeaturePair : selectedFeatures) {
            String selectedFeature = selectedFeaturePair.getKey();
            String cellValue = "";
            switch (featureRepresentation) {
                case Binary:
                    cellValue = elementFeaturesFrequencies.containsKey(selectedFeature) ? "1" : "0";
                    break;
                case TFIDF: {
                    int numOfElementsContainTheFeature = selectedFeaturePair.getValue();
                    Integer frequency = elementFeaturesFrequencies.get(selectedFeature);
                    int featureFrequencyInElement = (frequency != null) ? frequency : 0;
                    double tfidf = MathCalc.GetTFIDF(featureFrequencyInElement,
                            mostCommonFeatureFrequencyInElement, totalElementsNum,
                            numOfElementsContainTheFeature);
                    tfidf = MathCalc.Round(tfidf, 3);
                    cellValue = String.valueOf(tfidf);
                    break;
                }
                default:
                    // Unhandled representations produce an empty cell.
                    break;
            }
            featuresVectorCSV.append(cellValue).append(",");
        }

        if (addClassificationColumn) {
            featuresVectorCSV.append(classification.toString());
        } else {
            // Safe on an empty builder (no ID column and no selected features).
            RemoveTrailingComma(featuresVectorCSV);
        }
        return featuresVectorCSV;
    }

    /**
     * Return the frequency of the most common (selected) feature in Element
     *
     * @param elementFeaturesFrequencies features frequencies in element
     * @param selectedFeatures the top selected features to build the dataset
     * with
     * @return the frequency of the most common (selected) feature in Element,
     * or 0 when the element contains none of the selected features
     */
    private static int GetMostCommonSelectedFeatureFrequencyInElement(
            Map<String, Integer> elementFeaturesFrequencies,
            ArrayList<Pair<String, Integer>> selectedFeatures) {
        int numOfOccurrencesOfMostCommonFeature = 0;
        for (Pair<String, Integer> selectedFeaturePair : selectedFeatures) {
            // Single lookup; a missing feature simply does not take part.
            Integer frequency = elementFeaturesFrequencies.get(selectedFeaturePair.getKey());
            if (frequency != null && frequency > numOfOccurrencesOfMostCommonFeature) {
                numOfOccurrencesOfMostCommonFeature = frequency;
            }
        }
        return numOfOccurrencesOfMostCommonFeature;
    }

    /**
     * Return CSV string which represents the header row of the dataset:
     * an optional "Element" column, then f1..fN, then an optional "Class"
     * column.
     *
     * @param selectedFeaturesNum the top selected features number to build the
     * dataset with
     * @param addElementIDColumn add prefix column identifying the record
     * @param addClassificationColumn add suffix column identifying the class of
     * the record
     * @return CSV string which represents the header row of the dataset
     * (empty when there are no columns at all)
     */
    public static StringBuilder GetDatasetHeaderCSV(int selectedFeaturesNum, boolean addElementIDColumn,
            boolean addClassificationColumn) {
        StringBuilder datasetHeaderCSV = new StringBuilder();
        if (addElementIDColumn) {
            datasetHeaderCSV.append("Element,");
        }
        for (int i = 1; i <= selectedFeaturesNum; i++) {
            datasetHeaderCSV.append("f").append(i).append(",");
        }
        if (addClassificationColumn) {
            datasetHeaderCSV.append("Class");
        } else {
            // Remove the last "," - guarded so a header with no columns does
            // not throw.
            RemoveTrailingComma(datasetHeaderCSV);
        }
        return datasetHeaderCSV;
    }

    /**
     * Generate Top datasets from the given original CSV dataset. One file is
     * written per requested top value, named
     * {@code <datasetFilename>_<letter>_Top(<top>).csv} where the letter
     * starts at 'a' and advances with each file.
     *
     * @param originalCSVDataset the original dataset to extract top X features
     * from
     * @param tops top X datasets to build
     * @param destinationFolder the destination folder to write the datasets
     * @param datasetFilename the dataset filename
     * @param elementIDColumnExist is column identifying the record exists
     * @param classificationColumnExist is column identifying the class of the
     * record exists
     */
    public static void GenerateTopDatasets(StringBuilder originalCSVDataset, ArrayList<Integer> tops,
            String destinationFolder, String datasetFilename, boolean elementIDColumnExist,
            boolean classificationColumnExist) {
        char letter = 'a';
        for (Integer top : tops) {
            String destinationFile = String.format("%s_%s_Top(%s).csv", datasetFilename, letter, top);
            String topDataset = GetTopXDataset(originalCSVDataset, top, elementIDColumnExist,
                    classificationColumnExist);
            // Platform separator instead of a hard-coded "\\" (same value on
            // Windows, a valid path everywhere else).
            FileWriter.WriteFile(topDataset, destinationFolder + java.io.File.separator + destinationFile);
            // Report only after the file has actually been written.
            Console.PrintLine(String.format("Dataset Top %s generated!", top));
            letter++;
        }
    }

    /**
     * Return CSV string of the top X features from the given dataset. The
     * number of available features is derived from the first line. When
     * fewer than {@code topX} features are available a message is printed
     * and an empty string is returned. Every emitted line ends with "\n" and
     * blank lines are dropped.
     *
     * @param originalCSVDataset the original dataset to extract top X features
     * from
     * @param topX top X features to extract
     * @param elementIDColumnExist is column identifying the record exists
     * @param classificationColumnExist is column identifying the class of the
     * record exists
     * @return CSV string of the top X features from the given dataset
     */
    public static String GetTopXDataset(StringBuilder originalCSVDataset, int topX,
            boolean elementIDColumnExist, boolean classificationColumnExist) {
        String[] lines = originalCSVDataset.toString().split("\n");

        int originalFeaturesCount = lines[0].split(",").length;
        if (elementIDColumnExist) {
            originalFeaturesCount--;
        }
        if (classificationColumnExist) {
            originalFeaturesCount--;
        }

        StringBuilder newCSVDatabase = new StringBuilder();
        if (originalFeaturesCount < topX) {
            Console.PrintLine(String.format("Requested top %s features out of %s!", topX,
                    originalFeaturesCount));
            return newCSVDatabase.toString();
        }

        for (String line : lines) {
            if (!line.isEmpty()) {
                newCSVDatabase.append(GetTopXCSVLine(line, topX, elementIDColumnExist,
                        classificationColumnExist)).append("\n");
            }
        }
        return newCSVDatabase.toString();
    }

    /**
     * Return CSV line of the top X features from the given dataset line. The
     * element ID column (when present) and the classification column (when
     * present) are kept. The result never ends with a dangling ",".
     *
     * @param csvLine the original dataset CSV line to extract top X features
     * from
     * @param topX top X features to extract
     * @param elementIDColumnExist is column identifying the record exists
     * @param classificationColumnExist is column identifying the class of the
     * record exists
     * @return CSV line of the top X features from the given dataset line
     */
    private static String GetTopXCSVLine(String csvLine, int topX, boolean elementIDColumnExist,
            boolean classificationColumnExist) {
        String classColumn = "";
        if (classificationColumnExist) {
            classColumn = csvLine.substring(csvLine.lastIndexOf(",") + 1);
        }

        // Number of leading columns to keep: optional ID column + topX features.
        int leadingColumns = topX + (elementIDColumnExist ? 1 : 0);
        if (leadingColumns <= 0) {
            return classColumn;
        }

        int indexOfLastTop = StringUtils.ordinalIndexOf(csvLine, ",", leadingColumns);
        if (indexOfLastTop < 0) {
            // The line has no more columns than requested (e.g. topX equals the
            // feature count and there is no class column): nothing to cut.
            return csvLine;
        }
        if (classificationColumnExist) {
            // Keep the separating "," and re-attach the class column.
            return csvLine.substring(0, indexOfLastTop + 1) + classColumn;
        }
        // No class column follows, so drop the "," after the last kept feature.
        return csvLine.substring(0, indexOfLastTop);
    }

    /**
     * Return CSV string containing the list of features and their document
     * frequencies. Columns are separated by "|" and the first row is the
     * header "Features|Benign|Malicious".
     *
     * @param featuresDocumentFrequencies features document frequencies; index
     * 0 of each value is written under "Benign" and index 1 under "Malicious"
     * @return StringBuilder
     */
    public static StringBuilder GetFeaturesDocumentFrequenciesCSV(Map<String, int[]> featuresDocumentFrequencies) {
        String separator = "|";
        StringBuilder sb = new StringBuilder();
        sb.append("Features").append(separator).append("Benign").append(separator).append("Malicious")
                .append("\n");
        for (Map.Entry<String, int[]> entry : featuresDocumentFrequencies.entrySet()) {
            int[] frequencies = entry.getValue();
            sb.append(entry.getKey()).append(separator).append(frequencies[0]).append(separator)
                    .append(frequencies[1]).append("\n");
        }
        return sb;
    }

    /**
     * Return CSV string containing the list of selected features, one row per
     * feature, labelled f1..fN in list order.
     *
     * @param selectedFeatures ArrayList of selected features; each pair's
     * value is written in the "Total" column
     * @param featuresDocumentFrequencies all features document frequencies
     * (Benign, Malicious); must contain an entry for every selected feature
     * @return StringBuilder
     */
    public static StringBuilder GetSelectedFeaturesCSV(ArrayList<Pair<String, Integer>> selectedFeatures,
            Map<String, int[]> featuresDocumentFrequencies) {
        StringBuilder sb = new StringBuilder();
        sb.append("#,Feature,InBenignFiles,InMaliciousFiles,Total\n");
        for (int i = 0; i < selectedFeatures.size(); i++) {
            Pair<String, Integer> pair = selectedFeatures.get(i);
            String feature = pair.getKey();
            int[] benignMaliciousFrequencies = featuresDocumentFrequencies.get(feature);
            sb.append(String.format("f%s,%s,%s,%s,%s", i + 1, feature, benignMaliciousFrequencies[0],
                    benignMaliciousFrequencies[1], pair.getValue())).append("\n");
        }
        return sb;
    }

    /**
     * Remove a single trailing "," from the given builder, if there is one.
     *
     * @param csv the builder to trim in place
     */
    private static void RemoveTrailingComma(StringBuilder csv) {
        int lastIndex = csv.length() - 1;
        if (lastIndex >= 0 && csv.charAt(lastIndex) == ',') {
            csv.deleteCharAt(lastIndex);
        }
    }
}