// ExternalResultsReader — imports and aggregates external evaluation results from CSV files.
package de.tudarmstadt.tk.statistics.importer;

/**
 * Copyright 2014
 * Telecooperation (TK) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import au.com.bytecode.opencsv.CSVReader;
import de.tudarmstadt.tk.statistics.config.ReportTypes;
import de.tudarmstadt.tk.statistics.config.StatsConfig;
import de.tudarmstadt.tk.statistics.config.StatsConfigConstants;
import de.tudarmstadt.tk.statistics.helper.Helpers;
import de.tudarmstadt.tk.statistics.test.SampleData;

/**
 * Imports evaluation results produced outside the statistics pipeline from
 * semicolon-separated CSV files, aggregates them per train/test split and
 * feature set, and converts them into {@link SampleData} objects for the
 * statistical evaluation.
 *
 * @author Guckelsberger, Schulz
 */
public class ExternalResultsReader {

	private static final Logger logger = LogManager.getLogger("Statistics");

	/**
	 * Reads train/test results from a single semicolon-separated CSV file,
	 * averages the measures over all classifiers per (train set, test set,
	 * feature set) combination and writes the aggregate to
	 * {@code AggregatedTrainTest.csv} next to the input file.
	 *
	 * @param filePath path to the input CSV file (not a directory)
	 */
	public static void readMUGCTrainTest(String filePath) {

		String outFileName = "AggregatedTrainTest.csv";
		logger.log(Level.INFO, String.format("Importing data from directory %s.", filePath));

		// Method requires a single input file; abort if a directory was given.
		File directory = new File(filePath);
		if (directory.isDirectory()) {
			System.err.println("Please specify a file. Aborting.");
			return;
		}

		// Empty previous output file, if there was one
		File outputFile = new File(directory.getParentFile(), outFileName);
		if (outputFile.exists()) {
			outputFile.delete();
		}

		// Write the header line first; try-with-resources closes the writer
		// even if println fails.
		try (PrintWriter out = new PrintWriter(new FileWriter(outputFile, true))) {
			String header = "Train;Test;Classifier;FeatureSet;Measure;Value";
			out.println(header);
		} catch (IOException e) {
			System.err.println("Error while writing aggregated Train-Test file.");
			e.printStackTrace();
		}

		ArrayList<String> outputRows = new ArrayList<String>();

		// iterate all rows
		List<String[]> inputRowsFirstFile = readAndCheckCSV(filePath, ';');

		// first: order by train set
		ArrayList<ExternalResults> extResults = new ArrayList<>();
		for (int i = 0; i < inputRowsFirstFile.size(); i++) {
			ExternalResults results = new ExternalResults();

			// identify current train/test split; column 0 looks like
			// "TRAIN: <name>, TEST: <name>"
			String[] datasetNames = inputRowsFirstFile.get(i)[0].replace("TRAIN:", "").replace("TEST:", "").split(",");
			results.trainSetName = datasetNames[0].replace(" ", "");
			results.testSetName = datasetNames[1].replace(" ", "");

			// set classifier name
			results.classifierParameters = inputRowsFirstFile.get(i)[1];

			// read feature set
			results.featureSetName = inputRowsFirstFile.get(i)[2];

			// read classification results
			results.recall = Double.parseDouble(inputRowsFirstFile.get(i)[3]);
			results.fMeasure = Double.parseDouble(inputRowsFirstFile.get(i)[4]);
			results.precision = Double.parseDouble(inputRowsFirstFile.get(i)[5]);
			// NOTE(review): readAndCheckCSV() aborts unless each row has exactly
			// 7 columns, yet column 10 is read here — TODO confirm which input
			// format this method is supposed to consume.
			results.accuracy = Double.parseDouble(inputRowsFirstFile.get(i)[10]) / 100;

			extResults.add(results);
		}

		// Group results by (train set, test set, feature set)
		HashMap<String, ArrayList<ExternalResults>> extResultsByTrainTestFeature = new HashMap<>();
		for (ExternalResults result : extResults) {
			String idKey = result.trainSetName + result.testSetName + result.featureSetName;
			if (extResultsByTrainTestFeature.containsKey(idKey)) {
				extResultsByTrainTestFeature.get(idKey).add(result);
			} else {
				extResultsByTrainTestFeature.put(idKey, new ArrayList<ExternalResults>());
				extResultsByTrainTestFeature.get(idKey).add(result);
			}
		}

		ArrayList<ExternalResults> aggregatedResults = new ArrayList<>();

		// Average measures over all classifiers within each group
		for (Entry<String, ArrayList<ExternalResults>> trainTestSplit : extResultsByTrainTestFeature.entrySet()) {
			ExternalResults aggrResult = new ExternalResults();
			double recall = 0;
			double fMeasure = 0;
			double precision = 0;
			double accuracy = 0;
			int nrClassifiers = 0;

			// for all entries that are from the same train/test split and use
			// the same feature set -> aggregate results
			for (ExternalResults result : trainTestSplit.getValue()) {
				aggrResult.testSetName = result.testSetName;
				aggrResult.trainSetName = result.trainSetName;
				aggrResult.classifierParameters = result.classifierParameters;
				aggrResult.featureSetName = result.featureSetName;
				recall += result.recall;
				fMeasure += result.fMeasure;
				precision += result.precision;
				accuracy += result.accuracy;
				nrClassifiers++;
			}
			aggrResult.accuracy = (accuracy / nrClassifiers);
			aggrResult.fMeasure = (fMeasure / nrClassifiers);
			aggrResult.recall = (recall / nrClassifiers);
			aggrResult.precision = (precision / nrClassifiers);
			aggregatedResults.add(aggrResult);
		}

		// Emit one output row per measure; classifier column is "0"
		// (= aggregated over classifiers)
		for (ExternalResults result : aggregatedResults) {
			String outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Percent Correct", result.accuracy);
			outputRows.add(outputRow);
			outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Weighted Precision", result.precision);
			outputRows.add(outputRow);
			outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Weighted Recall", result.recall);
			outputRows.add(outputRow);
			outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Weighted F-Measure", result.fMeasure);
			outputRows.add(outputRow);
		}

		// Write aggregated data to a new file
		try (PrintWriter out = new PrintWriter(new FileWriter(outputFile, true))) {
			for (String s : outputRows) {
				out.println(s);
			}
		} catch (IOException e) {
			System.err.println("Error while writing aggregated Train-Test file.");
			e.printStackTrace();
		}

		logger.log(Level.INFO, String.format("Finished import. The aggregated data was written to %s.", outFileName));
	}

	/**
	 * Reads cross-validation results from a single semicolon-separated CSV
	 * file, averages the measures over all classifiers per (train set,
	 * feature set) combination and writes the aggregate to
	 * {@code AggregatedTrainTest.csv} next to the input file.
	 *
	 * @param filePath path to the input CSV file (not a directory)
	 */
	public static void readMUGCCV(String filePath) {

		String outFileName = "AggregatedTrainTest.csv";
		logger.log(Level.INFO, String.format("Importing data from directory %s.", filePath));

		// Method requires a single input file; abort if a directory was given.
		File directory = new File(filePath);
		if (directory.isDirectory()) {
			System.err.println("Please specify a file. Aborting.");
			return;
		}

		// Empty previous output file, if there was one
		File outputFile = new File(directory.getParentFile(), outFileName);
		if (outputFile.exists()) {
			outputFile.delete();
		}

		// Write the header line first; try-with-resources closes the writer
		// even if println fails.
		try (PrintWriter out = new PrintWriter(new FileWriter(outputFile, true))) {
			String header = "Train;Test;Classifier;FeatureSet;Measure;Value";
			out.println(header);
		} catch (IOException e) {
			System.err.println("Error while writing aggregated Train-Test file.");
			e.printStackTrace();
		}

		ArrayList<String> outputRows = new ArrayList<String>();

		// iterate all rows
		List<String[]> inputRowsFirstFile = readAndCheckCSV(filePath, ';');

		// first: order by train set
		ArrayList<ExternalResults> extResults = new ArrayList<>();
		for (int i = 0; i < inputRowsFirstFile.size(); i++) {
			ExternalResults results = new ExternalResults();

			// identify current CV dataset; column 0 looks like "CV: <name>".
			// No separate test set: train == test in cross-validation.
			String[] datasetNames = inputRowsFirstFile.get(i)[0].split(",");
			results.trainSetName = datasetNames[0].replace("CV: ", "").replace(" ", "");

			// set classifier name
			results.classifierParameters = inputRowsFirstFile.get(i)[1];

			// read feature set
			results.featureSetName = inputRowsFirstFile.get(i)[2];

			// read classification results
			results.recall = Double.parseDouble(inputRowsFirstFile.get(i)[3]);
			results.fMeasure = Double.parseDouble(inputRowsFirstFile.get(i)[4]);
			results.precision = Double.parseDouble(inputRowsFirstFile.get(i)[5]);
			// NOTE(review): readAndCheckCSV() aborts unless each row has exactly
			// 7 columns, yet column 10 is read here — TODO confirm which input
			// format this method is supposed to consume.
			results.accuracy = Double.parseDouble(inputRowsFirstFile.get(i)[10]) / 100;

			extResults.add(results);
		}

		// Group results by (train set, test set, feature set); testSetName is
		// null here, so effectively by (train set, feature set)
		HashMap<String, ArrayList<ExternalResults>> extResultsByTrainTestFeature = new HashMap<>();
		for (ExternalResults result : extResults) {
			String idKey = result.trainSetName + result.testSetName + result.featureSetName;
			if (extResultsByTrainTestFeature.containsKey(idKey)) {
				extResultsByTrainTestFeature.get(idKey).add(result);
			} else {
				extResultsByTrainTestFeature.put(idKey, new ArrayList<ExternalResults>());
				extResultsByTrainTestFeature.get(idKey).add(result);
			}
		}

		ArrayList<ExternalResults> aggregatedResults = new ArrayList<>();

		// Average measures over all classifiers within each group
		for (Entry<String, ArrayList<ExternalResults>> trainTestSplit : extResultsByTrainTestFeature.entrySet()) {
			ExternalResults aggrResult = new ExternalResults();
			double recall = 0;
			double fMeasure = 0;
			double precision = 0;
			double accuracy = 0;
			int nrClassifiers = 0;

			// for all entries that are from the same train/test split and use
			// the same feature set -> aggregate results
			for (ExternalResults result : trainTestSplit.getValue()) {
				aggrResult.testSetName = result.testSetName;
				aggrResult.trainSetName = result.trainSetName;
				aggrResult.classifierParameters = result.classifierParameters;
				aggrResult.featureSetName = result.featureSetName;
				recall += result.recall;
				fMeasure += result.fMeasure;
				precision += result.precision;
				accuracy += result.accuracy;
				nrClassifiers++;
			}
			aggrResult.accuracy = (accuracy / nrClassifiers);
			aggrResult.fMeasure = (fMeasure / nrClassifiers);
			aggrResult.recall = (recall / nrClassifiers);
			aggrResult.precision = (precision / nrClassifiers);
			aggregatedResults.add(aggrResult);
		}

		// Emit one output row per measure; classifier column is "0"
		// (= aggregated over classifiers)
		for (ExternalResults result : aggregatedResults) {
			String outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Percent Correct", result.accuracy);
			outputRows.add(outputRow);
			outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Weighted Precision", result.precision);
			outputRows.add(outputRow);
			outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Weighted Recall", result.recall);
			outputRows.add(outputRow);
			outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0", result.featureSetName, "Weighted F-Measure", result.fMeasure);
			outputRows.add(outputRow);
		}

		// Write aggregated data to a new file
		try (PrintWriter out = new PrintWriter(new FileWriter(outputFile, true))) {
			for (String s : outputRows) {
				out.println(s);
			}
		} catch (IOException e) {
			System.err.println("Error while writing aggregated Train-Test file.");
			e.printStackTrace();
		}

		logger.log(Level.INFO, String.format("Finished import. The aggregated data was written to %s.", outFileName));
	}

	/**
	 * Reads LOD-pipeline train/test results from a directory of
	 * per-train-set subdirectories, averages the measures per test set and
	 * semantic-feature configuration over all classifiers, and writes the
	 * aggregate to {@code AggregatedCVRandom.csv} inside the directory.
	 *
	 * @param pathToDirectory directory whose subdirectories (one per train
	 *                        set) contain the source .csv files
	 */
	public static void readLODPipelineTrainTest(String pathToDirectory) {
		Locale.setDefault(Locale.ENGLISH);

		// Fixed row layout of the source files: rows cycle through these
		// semantic feature configurations in order.
		String[] semanticFeatures = new String[] { "Baseline", "+ALL", "+LOC", "+TIME", "+LOD", "+LOC+TIME", "+LOC+LOD", "+TIME+LOD", "+TYPES", "+CAT" };
		String[] measures = new String[] { "Percent Correct", "Weighted Precision", "Weighted Recall", "Weighted F-Measure" };

		String outFileName = "AggregatedCVRandom.csv";
		logger.log(Level.INFO, String.format("Importing data from directory %s.", pathToDirectory));

		// Method requires input directory. Check this condition.
		File directory = new File(pathToDirectory);
		if (!directory.isDirectory()) {
			System.err.println("Please specify a directory with the source .csv files. Aborting.");
			return;
		}

		// Empty previous output file, if there was one
		File outputFile = new File(directory, outFileName);
		if (outputFile.exists()) {
			outputFile.delete();
		}

		// Write the header line first; try-with-resources closes the writer
		// even if println fails.
		try (PrintWriter out = new PrintWriter(new FileWriter(outputFile, true))) {
			String header = "Train;Test;Classifier;FeatureSet;Measure;Value";
			out.println(header);
		} catch (IOException e) {
			System.err.println("Error while writing aggregated Train-Test file.");
			e.printStackTrace();
		}

		// prepare files lists, keyed by test data set name
		HashMap<String, ArrayList<File>> filesMap = new HashMap<>();

		// read all subdirectories that match the city names
		File[] subdirs = directory.listFiles((FileFilter) DirectoryFileFilter.DIRECTORY);

		// Iterate all subdirectories. NOTE(review): the aggregation and the
		// output-file write below are intentionally kept inside this loop to
		// preserve the original behavior (filesMap is drained via it.remove()
		// on every pass) — TODO confirm this per-subdirectory flushing is
		// intended rather than a misplaced brace.
		for (File subDirectory : subdirs) {

			// get train set name
			String trainSetName = subDirectory.getName();

			// iterate all files in directory
			File[] filesInDirectory = subDirectory.listFiles();
			List<File> fileList = Arrays.asList(filesInDirectory);
			for (File subDirFile : fileList) {

				// Derive the test set name from the file name pattern
				// "<train>To<test>Results.csv"
				String[] filenameTokens = subDirFile.getName().split("To");
				String testDataName;

				// if only this string is left, then CV (train == test)
				if (filenameTokens[1].equals("Results.csv")) {
					testDataName = trainSetName;
				} else {
					testDataName = filenameTokens[1].split("Results.csv")[0];
					// NOTE(review): '.' is a regex metacharacter, so ".csv"
					// here matches any char + "csv" — TODO confirm/escape.
					testDataName = testDataName.split("2C.csv|4C.csv|.csv")[0];
				}

				// put current file to test data name -> this way all files
				// corresponding to the same test set are in one map
				if (filesMap.get(testDataName) != null) {
					// get existing list and add file
					ArrayList<File> currentFileList = filesMap.get(testDataName);
					currentFileList.add(subDirFile);
				} else {
					// create new list and add current file
					ArrayList<File> newFileList = new ArrayList<>();
					newFileList.add(subDirFile);
					filesMap.put(testDataName, newFileList);
				}
			}

			ArrayList<String> outputRows = new ArrayList<String>();
			int nrDifferentClassifiers = 0;

			// iterate all files of one map
			Iterator<Entry<String, ArrayList<File>>> it = filesMap.entrySet().iterator();
			while (it.hasNext()) {
				// Parameterized entry instead of raw Map.Entry + casts
				Entry<String, ArrayList<File>> pairs = it.next();
				String testSetName = pairs.getKey();
				ArrayList<File> testFiles = pairs.getValue();
				nrDifferentClassifiers = testFiles.size();

				// initialize data store: one map per input row
				ArrayList<HashMap<String, Object>> values = new ArrayList<>();

				// get rows for first file to initialize store
				List<String[]> inputRowsFirstFile = readAndCheckCSV(testFiles.get(0).getAbsolutePath(), ';');
				for (int i = 0; i < inputRowsFirstFile.size(); i++) {
					HashMap<String, Object> currentRowValues = new HashMap<>();
					currentRowValues.put("semanticFeature", "");
					currentRowValues.put("classifierParameters", "");
					currentRowValues.put("aggregatedMeasureValues", new double[measures.length]);
					currentRowValues.put("nGrams", "");
					values.add(currentRowValues);
				}

				// get results from other files
				for (File testFile : testFiles) {

					// Only analyse files with .csv extension; skip our own output
					if (!FilenameUtils.getExtension(testFile.getName().toLowerCase()).equals("csv") || testFile.getName().equals("AggregatedTrainTest.csv")) {
						continue;
					}

					// check file for consistency
					List<String[]> inputRows = readAndCheckCSV(testFile.getAbsolutePath(), ';');

					// check if length matches first file
					if (!(inputRows.size() == values.size())) {
						// TODO error message
					} else {
						for (int i = 0; i < inputRows.size(); i++) {
							String[] inputCells = inputRows.get(i);

							// read current values and compare with entries;
							// value comparison instead of reference (==) comparison
							String semanticFeature = semanticFeatures[i % semanticFeatures.length];
							if ("".equals(values.get(i).get("semanticFeature"))) {
								values.get(i).put("semanticFeature", semanticFeature);
							} else {
								if (!values.get(i).get("semanticFeature").equals(semanticFeature)) {
									System.err.println("Semantic Features do not match.");
									System.exit(1);
								}
							}

							// classifier-parameter consistency check removed;
							// it needs rework since aggregation happens here

							String nGrams = inputCells[12];
							if ("".equals(values.get(i).get("nGrams"))) {
								values.get(i).put("nGrams", nGrams);
							} else {
								if (!values.get(i).get("nGrams").equals(nGrams)) {
									System.err.println("N Gram Length does not match.");
									System.exit(1);
								}
							}

							// get and aggregate values; measure 0
							// ("Percent Correct") is scaled from percent to [0,1]
							for (int j = 0; j < measures.length; j++) {
								double valueInFile = Double.parseDouble(inputCells[j + 16]);
								if (j == 0) {
									valueInFile /= 100;
								}
								((double[]) values.get(i).get("aggregatedMeasureValues"))[j] += valueInFile;
							}
						}
					}
				}

				// write aggregated results to file
				for (HashMap<String, Object> currentValues : values) {
					String semFeature = (String) currentValues.get("semanticFeature");
					String nGrams = (String) currentValues.get("nGrams");
					String featureSet = String.format("%s, nGrams: %s", semFeature, nGrams);
					for (int j = 0; j < measures.length; j++) {
						String outputRow = String.format("%s;%s;%s;%s;%s;%f", trainSetName, testSetName, "0", featureSet, measures[j], ((double[]) currentValues.get("aggregatedMeasureValues"))[j] / nrDifferentClassifiers);
						outputRows.add(outputRow);
					}
				}

				// avoids a ConcurrentModificationException
				it.remove();
			}

			// Write aggregated data to a new file
			try (PrintWriter out = new PrintWriter(new FileWriter(outputFile, true))) {
				for (String s : outputRows) {
					out.println(s);
				}
			} catch (IOException e) {
				System.err.println("Error while writing aggregated Train-Test file.");
				e.printStackTrace();
			}
		}

		logger.log(Level.INFO, String.format("Finished import. The aggregated data was written to %s.", outFileName));
	}

	/**
	 * Converts imported CSV rows into a {@link SampleData} object for the
	 * statistical evaluation: parses models (classifier + feature set),
	 * measures, datasets and baseline indicators, groups the sample values
	 * per measure/model, validates them, and determines fold/repetition
	 * counts according to the pipeline type.
	 *
	 * @param config           statistics configuration (best-N selection)
	 * @param pipelineMetadata pipeline metadata such as "nFolds"/"nRepetitions"
	 * @param pipelineType     the report/pipeline type the rows stem from
	 * @param rows             CSV rows with columns Train;Test;Classifier;
	 *                         FeatureSet;Measure;Value;IsBaseline
	 * @return the assembled sample data, or {@code null} if fewer than two
	 *         data rows were supplied or the data is inconsistent with the
	 *         pipeline type
	 */
	public static SampleData interpretCSV(StatsConfig config, List<String[]> rows, ReportTypes pipelineType, HashMap<String, Integer> pipelineMetadata) {

		HashMap<Integer, ArrayList<ArrayList<Double>>> samplesPerMeasure = new HashMap<Integer, ArrayList<ArrayList<Double>>>();

		// Only remove first line if it is a header line
		if (rows.size() > 0 && rows.get(0)[6].equals("IsBaseline")) {
			rows.remove(0);
		}

		if (rows.size() > 1) {
			logger.log(Level.INFO, "Extracting samples and metadata from imported data.");
			int selectBestN = config.getSelectBestN();
			String selectByMeasure = config.getSelectByMeasure();

			// Preprocessing: Parse different models (classifier + feature set
			// column) and measures
			ArrayList<String> measures = new ArrayList<String>();
			ArrayList<Pair<String, String>> datasets = new ArrayList<Pair<String, String>>();
			ArrayList<Pair<String, String>> models = new ArrayList<Pair<String, String>>();
			ArrayList<Pair<String, String>> baselineModels = new ArrayList<Pair<String, String>>();
			for (int i = 0; i < rows.size(); i++) {
				String[] columns = rows.get(i);
				// Classifier "0" marks results aggregated over classifiers
				String classifier = columns[2];
				if (classifier.equals("0")) {
					classifier = "Aggregated";
				}
				String featureSets = columns[3];
				Pair<String, String> model = Pair.of(classifier, featureSets);
				if (!models.contains(model)) {
					models.add(model);
					// IsBaseline column == 1 marks the baseline model
					if (!baselineModels.contains(model) && Integer.parseInt(columns[6]) == 1) {
						baselineModels.add(model);
					}
				}
				if (!measures.contains(columns[4])) {
					measures.add(columns[4]);
				}
			}

			// Check: Baseline only allowed when > 2 models are evaluated
			if (models.size() <= 2 && baselineModels.size() > 0) {
				logger.log(Level.WARN, "At least three models are required to make an evaluation against a baseline meaningful. In the dataset, a baseline was specified for only two models. The baseline indicator will be ignored.");
				System.err.println("At least three models are required to make an evaluation against a baseline meaningful. In the dataset, a baseline was specified for only two models. The baseline indicator will be ignored.");
				baselineModels.clear();
			}

			// Now sort samples according to data
			Collections.sort(rows, new Helpers.LexicographicArrayComparator());
			for (int i = 0; i < rows.size(); i++) {
				String[] columns = rows.get(i);
				Pair<String, String> data = null;
				String trainData = columns[0].trim();
				String testData = columns[1].trim();
				// If this is a CV, numbers after a dot indicate fold UUIDs;
				// they have to be split off to retain the original dataset name
				if (pipelineType == ReportTypes.CV) {
					trainData = trainData.split("\\.")[0];
					testData = testData.split("\\.")[0];
				}
				if (trainData.equals(testData)) {
					data = Pair.of(trainData, null);
				} else {
					data = Pair.of(trainData, testData);
				}
				if (!datasets.contains(data)) {
					datasets.add(data);
				}
			}

			// Preprocessing: Initialize sample container per measure/model
			for (int i = 0; i < measures.size(); i++) {
				ArrayList<ArrayList<Double>> samplesPerModel = new ArrayList<ArrayList<Double>>();
				for (int j = 0; j < models.size(); j++) {
					samplesPerModel.add(new ArrayList<Double>());
				}
				samplesPerMeasure.put(i, samplesPerModel);
			}

			// Assign samples to different models
			for (int i = 0; i < rows.size(); i++) {
				String[] columns = rows.get(i);
				String classifier = columns[2];
				if (classifier.equals("0")) {
					classifier = "Aggregated";
				}
				String featureSet = columns[3];
				String measure = columns[4];
				double value = Double.parseDouble(columns[5]);
				int measureIndex = measures.indexOf(measure);
				int modelIndex = models.indexOf(Pair.of(classifier, featureSet));
				ArrayList<ArrayList<Double>> sPMeasure = samplesPerMeasure.get(measureIndex);
				sPMeasure.get(modelIndex).add(value);
			}

			// Transform into data format required by the statistical
			// evaluation: index by measure name, and compute per-model averages
			HashMap<String, ArrayList<ArrayList<Double>>> indexedSamples = new HashMap<String, ArrayList<ArrayList<Double>>>();
			HashMap<String, ArrayList<Double>> indexedSamplesAverage = new HashMap<String, ArrayList<Double>>();
			Iterator<Integer> it = samplesPerMeasure.keySet().iterator();
			while (it.hasNext()) {
				int measureIndex = it.next();
				ArrayList<ArrayList<Double>> samplesPerModel = samplesPerMeasure.get(measureIndex);
				ArrayList<Double> sampleAverages = new ArrayList<Double>(models.size());
				for (int modelIndex = 0; modelIndex < models.size(); modelIndex++) {
					ArrayList<Double> sample = samplesPerModel.get(modelIndex);
					double average = 0;
					for (int j = 0; j < sample.size(); j++) {
						average += sample.get(j);
					}
					average /= sample.size();
					sampleAverages.add(average);
				}
				indexedSamplesAverage.put(measures.get(measureIndex), sampleAverages);
				indexedSamples.put(measures.get(measureIndex), samplesPerMeasure.get(measureIndex));
			}

			// Check if data fulfills general requirements: > 5 samples for
			// each model, same number of samples per model
			it = samplesPerMeasure.keySet().iterator();
			while (it.hasNext()) {
				Integer measureIndex = it.next();
				ArrayList<ArrayList<Double>> samplesPerModel = samplesPerMeasure.get(measureIndex);
				int s = samplesPerModel.get(0).size();
				for (int i = 1; i < samplesPerModel.size(); i++) {
					if (samplesPerModel.get(i).size() < 5) {
						logger.log(Level.ERROR, "More than 5 samples are needed per model and measure. Aborting.");
						System.err.println("More than 5 samples are needed per model and measure. Aborting.");
						System.exit(1);
					}
					if (samplesPerModel.get(i).size() != s) {
						logger.log(Level.ERROR, "Different models are not represented by the same number of samples. Aborting.");
						System.err.println("Different models are not represented by the same number of samples. Aborting.");
						System.exit(1);
					}
				}
			}

			// Collect remaining data required for creating a SampleData object
			// Check if data fulfills requirements of the specific PipelineTypes
			int nFolds = 1;
			int nRepetitions = 1;
			switch (pipelineType) {
			case CV:
				if (datasets.size() > 1) {
					System.err.println("Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
					logger.log(Level.ERROR, "Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
					return null;
				} else if (datasets.get(0).getValue() != null) {
					System.err.println("Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
					logger.log(Level.ERROR, "Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
					return null;
				}
				nFolds = indexedSamples.get(measures.get(0)).get(0).size();
				nRepetitions = 1;
				break;
			case MULTIPLE_CV:
				if (datasets.size() > 1) {
					System.err.println("Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
					logger.log(Level.ERROR, "Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
					return null;
				} else if (datasets.get(0).getValue() != null) {
					System.err.println("Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
					logger.log(Level.ERROR, "Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
					return null;
				}
				nFolds = pipelineMetadata.get("nFolds");
				nRepetitions = indexedSamples.get(measures.get(0)).get(0).size();
				break;
			case CV_DATASET_LVL:
				nFolds = pipelineMetadata.get("nFolds");
				nRepetitions = 1;
				break;
			case MULTIPLE_CV_DATASET_LVL:
				nFolds = pipelineMetadata.get("nFolds");
				nRepetitions = pipelineMetadata.get("nRepetitions");
				break;
			case TRAIN_TEST_DATASET_LVL:
				nFolds = 1;
				nRepetitions = 1;
				break;
			default:
				System.err.println("Unknown PipelineType. Aborting.");
				logger.log(Level.ERROR, "Unknown PipelineType. Aborting.");
				return null;
			}

			// Reorder data in case of a baseline evaluation (baseline first)
			if (baselineModels.size() == 1) {
				Pair<String, String> baselineModel = baselineModels.get(0);
				int modelIndex = models.indexOf(baselineModel);
				models.remove(modelIndex);
				models.add(0, baselineModel);
				for (String measure : indexedSamples.keySet()) {
					ArrayList<Double> s = indexedSamples.get(measure).get(modelIndex);
					indexedSamples.get(measure).remove(modelIndex);
					indexedSamples.get(measure).add(0, s);
					double a = indexedSamplesAverage.get(measure).get(modelIndex);
					indexedSamplesAverage.get(measure).remove(modelIndex);
					indexedSamplesAverage.get(measure).add(0, a);
				}
			}

			SampleData sampleData = new SampleData(null, indexedSamples, indexedSamplesAverage, datasets, models, baselineModels, pipelineType, nFolds, nRepetitions);
			sampleData = Helpers.truncateData(sampleData, selectBestN, selectByMeasure);

			return sampleData;
		}
		return null;
	}

	/**
	 * Splits sample data such that only one independent variable
	 * (classifier or feature set, per configuration) varies within each
	 * resulting {@link SampleData}. If only one variable varies in the
	 * input, it is returned unchanged as a singleton list.
	 *
	 * @param config statistics configuration determining which variable is
	 *               held fixed
	 * @param data   the sample data to split
	 * @return list of sample data objects, one per fixed variable value
	 */
	public static List<SampleData> splitData(SampleData data, StatsConfig config) {

		List<SampleData> splitted = new ArrayList<SampleData>();

		// Use lists instead of sets to maintain order of model metadata
		ArrayList<String> featureSets = new ArrayList<String>();
		ArrayList<String> classifiers = new ArrayList<String>();
		for (Pair<String, String> metadata : data.getModelMetadata()) {
			if (!classifiers.contains(metadata.getLeft())) {
				classifiers.add(metadata.getLeft());
			}
			if (!featureSets.contains(metadata.getRight())) {
				featureSets.add(metadata.getRight());
			}
		}

		// Only separate data if there's more than one independent variable
		if (!(featureSets.size() > 1 && classifiers.size() > 1)) {
			splitted.add(data);
			return splitted;
		}

		// Values of the variable to hold fixed, one split per value
		List<String> fixedValues = (config.getFixIndependentVariable() == StatsConfigConstants.INDEPENDENT_VARIABLES_VALUES.Classifier) ? classifiers : featureSets;

		for (String fixed : fixedValues) {
			ArrayList<Pair<String, String>> modelMetadata = new ArrayList<Pair<String, String>>();
			HashMap<String, ArrayList<ArrayList<Double>>> samples = new HashMap<String, ArrayList<ArrayList<Double>>>();
			HashMap<String, ArrayList<Double>> sampleAverages = new HashMap<String, ArrayList<Double>>();

			// Collect metadata, samples and averages of all models matching
			// the fixed value
			for (int i = 0; i < data.getModelMetadata().size(); i++) {
				Pair<String, String> model = data.getModelMetadata().get(i);
				boolean eq = (config.getFixIndependentVariable() == StatsConfigConstants.INDEPENDENT_VARIABLES_VALUES.Classifier) ? model.getLeft().equals(fixed) : model.getRight().equals(fixed);
				if (eq) {
					modelMetadata.add(model);
					for (String measure : data.getSamples().keySet()) {
						if (!samples.containsKey(measure)) {
							samples.put(measure, new ArrayList<ArrayList<Double>>());
							sampleAverages.put(measure, new ArrayList<Double>());
						}
						samples.get(measure).add(data.getSamples().get(measure).get(i));
						sampleAverages.get(measure).add(data.getSamplesAverage().get(measure).get(i));
					}
				}
			}

			ArrayList<Pair<String, String>> baselineModelData = new ArrayList<Pair<String, String>>();
			if (data.isBaselineEvaluation()) {
				// Locate the baseline model for this split
				Pair<String, String> baselineModel = null;
				for (int i = 0; i < data.getBaselineModelMetadata().size(); i++) {
					boolean eq = (config.getFixIndependentVariable() == StatsConfigConstants.INDEPENDENT_VARIABLES_VALUES.Classifier) ? data.getBaselineModelMetadata().get(i).getLeft().equals(fixed) : data.getBaselineModelMetadata().get(i).getRight().equals(fixed);
					if (eq) {
						baselineModel = data.getBaselineModelMetadata().get(i);
						break;
					}
				}
				if (baselineModel != null) {
					// Move the baseline model to the front of all containers
					baselineModelData.add(baselineModel);
					int modelIndex = modelMetadata.indexOf(baselineModel);
					modelMetadata.remove(modelIndex);
					modelMetadata.add(0, baselineModel);
					for (String measure : data.getSamples().keySet()) {
						ArrayList<Double> s = samples.get(measure).get(modelIndex);
						samples.get(measure).remove(modelIndex);
						samples.get(measure).add(0, s);
						double a = sampleAverages.get(measure).get(modelIndex);
						sampleAverages.get(measure).remove(modelIndex);
						sampleAverages.get(measure).add(0, a);
					}
				} else {
					logger.log(Level.ERROR, "Missing baseline model! Please check if baseline indicators are set correctly in the input file, and if they correspond correctly to the fixIndependentVariable property in the configuration. In case of both varying feature sets and classifiers, baseline indicators have to be set multiple times.");
					System.err.println("Missing baseline model! Please check if baseline indicators are set correctly in the input file, and if they correspond correctly to the fixIndependentVariable property in the configuration. In case of both varying feature sets and classifiers, baseline indicators have to be set multiple times.");
					System.exit(1);
				}
			}

			SampleData newData = new SampleData(null, samples, sampleAverages, data.getDatasetNames(), modelMetadata, baselineModelData, data.getPipelineType(), data.getnFolds(), data.getnRepetitions());
			splitted.add(newData);
		}

		return splitted;
	}

	/**
	 * Read csv file, split each line by the specified separator and check
	 * whether each line can be split into the same number of columns.
	 * Terminates the JVM via {@code System.exit(1)} if the file is missing,
	 * unreadable, or its rows do not all have exactly 7 columns.
	 *
	 * @param pathToCsvFile the path to the .csv file
	 * @param separator     the separator to be used to split a line in
	 *                      separate cells, each relating to one column
	 * @return list containing all lines split into tokens
	 */
	public static List<String[]> readAndCheckCSV(String pathToCsvFile, char separator) {

		List<String[]> rows = new ArrayList<String[]>();
		// try-with-resources closes the reader on all paths
		try (CSVReader reader = new CSVReader(new FileReader(pathToCsvFile), separator)) {
			rows = reader.readAll();
			if (rows.size() > 0) {
				for (String[] row : rows) {
					if (row.length != rows.get(0).length) {
						logger.log(Level.ERROR, ".csv file corrupt: number of columns not same for each row.");
						System.err.println(".csv file corrupt: number of columns not same for each row.");
						System.exit(1);
					}
					if (row.length != 7) {
						logger.log(Level.ERROR, ".csv file corrupt: must contain exactly 7 columns.");
						System.err.println(".csv file corrupt: must contain exactly 7 columns.");
						System.exit(1);
					}
				}
			}
		} catch (FileNotFoundException e) {
			logger.log(Level.ERROR, "Input .csv file not found!");
			System.err.println("Input .csv file not found!");
			System.exit(1);
		} catch (IOException e) {
			logger.log(Level.ERROR, "Exception while reading input data .csv!");
			System.err.println("Exception while reading input data .csv!");
			e.printStackTrace();
			System.exit(1);
		}
		return rows;
	}
}