de.tudarmstadt.tk.statistics.importer.ExternalResultsReader.java Source code


Introduction

Here is the source code for de.tudarmstadt.tk.statistics.importer.ExternalResultsReader.java
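
ExternalResultsReader is a collection of static import helpers for the statistics module: readMUGCTrainTest, readMUGCCV and readLODPipelineTrainTest aggregate raw evaluation result .csv files into a common semicolon-separated Train;Test;Classifier;FeatureSet;Measure;Value format, while readAndCheckCSV, interpretCSV and splitData turn such a file into SampleData objects for the statistical evaluation. A minimal usage sketch follows the listing.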

Source

package de.tudarmstadt.tk.statistics.importer;

/**
 * Copyright 2014
 * Telecooperation (TK) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import au.com.bytecode.opencsv.CSVReader;
import de.tudarmstadt.tk.statistics.config.ReportTypes;
import de.tudarmstadt.tk.statistics.config.StatsConfig;
import de.tudarmstadt.tk.statistics.config.StatsConfigConstants;
import de.tudarmstadt.tk.statistics.helper.Helpers;
import de.tudarmstadt.tk.statistics.test.SampleData;

/**
 * Reads externally produced evaluation results, aggregates them into a common
 * csv format and converts them into {@link SampleData} objects for the
 * statistical evaluation.
 *
 * @author Guckelsberger, Schulz
 */
public class ExternalResultsReader {

    private static final Logger logger = LogManager.getLogger("Statistics");

    /**
     * Import MUGC train/test results from a single semicolon-separated file, average
     * recall, F-measure, precision and accuracy over all classifiers per
     * train/test/feature-set combination, and write the aggregated values to
     * AggregatedTrainTest.csv next to the input file.
     *
     * @param filePath path to the .csv file with the raw train/test results
     */
    public static void readMUGCTrainTest(String filePath) {
        String outFileName = "AggregatedTrainTest.csv";

        logger.log(Level.INFO, String.format("Importing data from file %s.", filePath));

        // This method requires a single input file, not a directory. Check this condition.
        File inputFile = new File(filePath);
        if (inputFile.isDirectory()) {
            System.err.println("Please specify a file. Aborting.");
            return;
        }

        //Empty previous output file, if there was one
        File outputFile = new File(inputFile.getParentFile(), outFileName);
        if (outputFile.exists()) {
            outputFile.delete();
        }
        try {
            String header = "Train;Test;Classifier;FeatureSet;Measure;Value";

            PrintWriter out = new PrintWriter(new FileWriter(outputFile, true));
            out.println(header);
            out.close();
        } catch (IOException e) {
            System.err.println("Error while writing aggregated Train-Test file.");
            e.printStackTrace();
        }

        ArrayList<String> outputRows = new ArrayList<String>();

        // read and check all rows of the input file
        List<String[]> inputRowsFirstFile = readAndCheckCSV(filePath, ';');

        // parse each row into an ExternalResults object
        ArrayList<ExternalResults> extResults = new ArrayList<>();

        for (int i = 0; i < inputRowsFirstFile.size(); i++) {
            ExternalResults results = new ExternalResults();

            // identify current train/test split
            String[] datasetNames = inputRowsFirstFile.get(i)[0].replace("TRAIN:", "").replace("TEST:", "")
                    .split(",");
            results.trainSetName = datasetNames[0].replace(" ", "");
            results.testSetName = datasetNames[1].replace(" ", "");

            // set classifier name
            results.classifierParameters = inputRowsFirstFile.get(i)[1];

            // read feature set
            results.featureSetName = inputRowsFirstFile.get(i)[2];

            // read classification results
            results.recall = Double.parseDouble(inputRowsFirstFile.get(i)[3]);
            results.fMeasure = Double.parseDouble(inputRowsFirstFile.get(i)[4]);
            results.precision = Double.parseDouble(inputRowsFirstFile.get(i)[5]);
            results.accuracy = Double.parseDouble(inputRowsFirstFile.get(i)[10]) / 100;

            extResults.add(results);
        }

        HashMap<String, ArrayList<ExternalResults>> extResultsByTrainTestFeature = new HashMap<>();

        // group results by train/test/feature-set combination
        for (ExternalResults result : extResults) {
            String idKey = result.trainSetName + result.testSetName + result.featureSetName;

            if (!extResultsByTrainTestFeature.containsKey(idKey)) {
                extResultsByTrainTestFeature.put(idKey, new ArrayList<ExternalResults>());
            }
            extResultsByTrainTestFeature.get(idKey).add(result);
        }

        ArrayList<ExternalResults> aggregatedResults = new ArrayList<>();

        // aggregate the results of each group by averaging over all classifiers
        for (Entry<String, ArrayList<ExternalResults>> trainTestSplit : extResultsByTrainTestFeature.entrySet()) {
            ExternalResults aggrResult = new ExternalResults();

            double recall = 0;
            double fMeasure = 0;
            double precision = 0;
            double accuracy = 0;
            int nrClassifiers = 0;

            // for all entries that are from the same train/test split and use the same feature set -> aggregate results
            for (ExternalResults result : trainTestSplit.getValue()) {
                aggrResult.testSetName = result.testSetName;
                aggrResult.trainSetName = result.trainSetName;
                aggrResult.classifierParameters = result.classifierParameters;
                aggrResult.featureSetName = result.featureSetName;

                recall += result.recall;
                fMeasure += result.fMeasure;
                precision += result.precision;
                accuracy += result.accuracy;
                nrClassifiers++;
            }

            aggrResult.accuracy = (accuracy / nrClassifiers);
            aggrResult.fMeasure = (fMeasure / nrClassifiers);
            aggrResult.recall = (recall / nrClassifiers);
            aggrResult.precision = (precision / nrClassifiers);

            aggregatedResults.add(aggrResult);
        }

        // write one output row per measure and train/test/feature-set combination
        for (ExternalResults result : aggregatedResults) {
            String outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Percent Correct", result.accuracy);
            outputRows.add(outputRow);

            outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Weighted Precision", result.precision);
            outputRows.add(outputRow);

            outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Weighted Recall", result.recall);
            outputRows.add(outputRow);

            outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Weighted F-Measure", result.fMeasure);
            outputRows.add(outputRow);

        }

        // Write aggregated data to a new file
        try {
            PrintWriter out = new PrintWriter(new FileWriter(outputFile, true));
            for (String s : outputRows) {
                out.println(s);
            }
            out.close();
        } catch (IOException e) {
            System.err.println("Error while writing aggregated Train-Test file.");
            e.printStackTrace();
        }

        logger.log(Level.INFO,
                String.format("Finished import. The aggregated data was written to %s.", outFileName));
    }

    /**
     * Import MUGC cross-validation results from a single semicolon-separated file, average
     * recall, F-measure, precision and accuracy over all classifiers per dataset/feature-set
     * combination, and write the aggregated values to AggregatedTrainTest.csv next to the
     * input file.
     *
     * @param filePath path to the .csv file with the raw cross-validation results
     */
    public static void readMUGCCV(String filePath) {
        String outFileName = "AggregatedTrainTest.csv";

        logger.log(Level.INFO, String.format("Importing data from file %s.", filePath));

        // This method requires a single input file, not a directory. Check this condition.
        File inputFile = new File(filePath);
        if (inputFile.isDirectory()) {
            System.err.println("Please specify a file. Aborting.");
            return;
        }

        //Empty previous output file, if there was one
        File outputFile = new File(inputFile.getParentFile(), outFileName);
        if (outputFile.exists()) {
            outputFile.delete();
        }
        try {
            String header = "Train;Test;Classifier;FeatureSet;Measure;Value";

            PrintWriter out = new PrintWriter(new FileWriter(outputFile, true));
            out.println(header);
            out.close();
        } catch (IOException e) {
            System.err.println("Error while writing aggregated Train-Test file.");
            e.printStackTrace();
        }

        ArrayList<String> outputRows = new ArrayList<String>();

        // read and check all rows of the input file
        List<String[]> inputRowsFirstFile = readAndCheckCSV(filePath, ';');

        // parse each row into an ExternalResults object
        ArrayList<ExternalResults> extResults = new ArrayList<>();

        for (int i = 0; i < inputRowsFirstFile.size(); i++) {
            ExternalResults results = new ExternalResults();

            // identify the dataset of the current CV run (train and test set are identical)
            String[] datasetNames = inputRowsFirstFile.get(i)[0].split(",");
            results.trainSetName = datasetNames[0].replace("CV: ", "").replace(" ", "");
            results.testSetName = results.trainSetName;

            // set classifier name
            results.classifierParameters = inputRowsFirstFile.get(i)[1];

            // read feature set
            results.featureSetName = inputRowsFirstFile.get(i)[2];

            // read classification results
            results.recall = Double.parseDouble(inputRowsFirstFile.get(i)[3]);
            results.fMeasure = Double.parseDouble(inputRowsFirstFile.get(i)[4]);
            results.precision = Double.parseDouble(inputRowsFirstFile.get(i)[5]);
            results.accuracy = Double.parseDouble(inputRowsFirstFile.get(i)[10]) / 100;

            extResults.add(results);
        }

        HashMap<String, ArrayList<ExternalResults>> extResultsByTrainTestFeature = new HashMap<>();

        // group results by train/test/feature-set combination
        for (ExternalResults result : extResults) {
            String idKey = result.trainSetName + result.testSetName + result.featureSetName;

            if (!extResultsByTrainTestFeature.containsKey(idKey)) {
                extResultsByTrainTestFeature.put(idKey, new ArrayList<ExternalResults>());
            }
            extResultsByTrainTestFeature.get(idKey).add(result);
        }

        ArrayList<ExternalResults> aggregatedResults = new ArrayList<>();

        // aggregate the results of each group by averaging over all classifiers
        for (Entry<String, ArrayList<ExternalResults>> trainTestSplit : extResultsByTrainTestFeature.entrySet()) {
            ExternalResults aggrResult = new ExternalResults();

            double recall = 0;
            double fMeasure = 0;
            double precision = 0;
            double accuracy = 0;
            int nrClassifiers = 0;

            // for all entries that are from the same train/test split and use the same feature set -> aggregate results
            for (ExternalResults result : trainTestSplit.getValue()) {
                aggrResult.testSetName = result.testSetName;
                aggrResult.trainSetName = result.trainSetName;
                aggrResult.classifierParameters = result.classifierParameters;
                aggrResult.featureSetName = result.featureSetName;

                recall += result.recall;
                fMeasure += result.fMeasure;
                precision += result.precision;
                accuracy += result.accuracy;
                nrClassifiers++;
            }

            aggrResult.accuracy = (accuracy / nrClassifiers);
            aggrResult.fMeasure = (fMeasure / nrClassifiers);
            aggrResult.recall = (recall / nrClassifiers);
            aggrResult.precision = (precision / nrClassifiers);

            aggregatedResults.add(aggrResult);
        }

        // write one output row per measure and dataset/feature-set combination
        for (ExternalResults result : aggregatedResults) {
            String outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Percent Correct", result.accuracy);
            outputRows.add(outputRow);

            outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Weighted Precision", result.precision);
            outputRows.add(outputRow);

            outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Weighted Recall", result.recall);
            outputRows.add(outputRow);

            outputRow = String.format("%s;%s;%s;%s;%s;%s", result.trainSetName, result.testSetName, "0",
                    result.featureSetName, "Weighted F-Measure", result.fMeasure);
            outputRows.add(outputRow);

        }

        // Write aggregated data to a new file
        try {
            PrintWriter out = new PrintWriter(new FileWriter(outputFile, true));
            for (String s : outputRows) {
                out.println(s);
            }
            out.close();
        } catch (IOException e) {
            System.err.println("Error while writing aggregated Train-Test file.");
            e.printStackTrace();
        }

        logger.log(Level.INFO,
                String.format("Finished import. The aggregated data was written to %s.", outFileName));
    }

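    /**
     * Import the .csv result files produced by the LOD pipeline. Every subdirectory of the
     * given directory is treated as one training set; the values of the individual result
     * files per test set are averaged and written to AggregatedCVRandom.csv in the common
     * Train;Test;Classifier;FeatureSet;Measure;Value format.
     *
     * @param pathToDirectory directory containing one subdirectory of result files per training set
     */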
    public static void readLODPipelineTrainTest(String pathToDirectory) {
        Locale.setDefault(Locale.ENGLISH);

        String[] semanticFeatures = new String[] { "Baseline", "+ALL", "+LOC", "+TIME", "+LOD", "+LOC+TIME",
                "+LOC+LOD", "+TIME+LOD", "+TYPES", "+CAT" };
        String[] measures = new String[] { "Percent Correct", "Weighted Precision", "Weighted Recall",
                "Weighted F-Measure" };
        String outFileName = "AggregatedCVRandom.csv";

        logger.log(Level.INFO, String.format("Importing data from directory %s.", pathToDirectory));

        // Method requires input directory. Check this condition.
        File directory = new File(pathToDirectory);
        if (!directory.isDirectory()) {
            System.err.println("Please specify a directory with the source .csv files. Aborting.");
            return;
        }

        //Empty previous output file, if there was one
        File outputFile = new File(directory, outFileName);
        if (outputFile.exists()) {
            outputFile.delete();
        }
        try {
            String header = "Train;Test;Classifier;FeatureSet;Measure;Value";

            PrintWriter out = new PrintWriter(new FileWriter(outputFile, true));
            out.println(header);
            out.close();
        } catch (IOException e) {
            System.err.println("Error while writing aggregated Train-Test file.");
            e.printStackTrace();
        }

        // prepare files lists
        HashMap<String, ArrayList<File>> filesMap = new HashMap<>();

        // read all subdirectories that match the city names
        File[] subdirs = directory.listFiles((FileFilter) DirectoryFileFilter.DIRECTORY);

        //Iterate all subdirectories
        for (File subDirectory : subdirs) {

            // get train set name
            String trainSetName = subDirectory.getName();

            // iterate all files in directory
            File[] filesInDirectory = subDirectory.listFiles();
            List<File> fileList = Arrays.asList(filesInDirectory);

            for (File subDirFile : fileList) {
                // get name of test data set
                String[] filenameTokens = subDirFile.getName().split("To");
                //String testDataName = filenameTokens[1].substring(0, filenameTokens[1].length() - 11);

                String testDataName;

                // if only this string is left, then CV
                if (filenameTokens[1].equals("Results.csv")) {
                    testDataName = trainSetName;
                } else {
                    testDataName = filenameTokens[1].split("Results\\.csv")[0];
                    testDataName = testDataName.split("2C\\.csv|4C\\.csv|\\.csv")[0];
                }

                // put current file to test data name -> this way all files
                // corresponding to the same test set are in one map
                if (filesMap.get(testDataName) != null) {
                    // get existing list and add file
                    ArrayList<File> currentFileList = filesMap.get(testDataName);
                    currentFileList.add(subDirFile);
                } else {
                    // create new list and add current file
                    ArrayList<File> newFileList = new ArrayList<>();
                    newFileList.add(subDirFile);
                    filesMap.put(testDataName, newFileList);
                }
            }

            ArrayList<String> outputRows = new ArrayList<String>();
            int nrDifferentClassifiers = 0;

            // iterate all files of one map
            Iterator<Entry<String, ArrayList<File>>> it = filesMap.entrySet().iterator();
            while (it.hasNext()) {
                Entry<String, ArrayList<File>> pairs = it.next();
                String testSetName = pairs.getKey();
                ArrayList<File> testFiles = pairs.getValue();

                nrDifferentClassifiers = testFiles.size();

                // initialize data store
                ArrayList<HashMap<String, Object>> values = new ArrayList<>();

                // get rows for first file to initialize store
                List<String[]> inputRowsFirstFile = readAndCheckCSV(testFiles.get(0).getAbsolutePath(), ';');

                for (int i = 0; i < inputRowsFirstFile.size(); i++) {
                    HashMap<String, Object> currentRowValues = new HashMap<>();
                    currentRowValues.put("semanticFeature", "");
                    currentRowValues.put("classifierParameters", "");
                    currentRowValues.put("aggregatedMeasureValues", new double[measures.length]);
                    currentRowValues.put("nGrams", "");
                    values.add(currentRowValues);
                }

                // get results from other files
                for (File testFile : testFiles) {
                    // Only analyse files with .csv extension
                    if (!FilenameUtils.getExtension(testFile.getName().toLowerCase()).equals("csv")
                            || testFile.getName().equals("AggregatedTrainTest.csv")) {
                        continue;
                    }
                    // check file for consistency
                    List<String[]> inputRows = readAndCheckCSV(testFile.getAbsolutePath(), ';');

                    // check if length matches first file
                    if (inputRows.size() != values.size()) {
                        // TODO error message
                    } else {
                        for (int i = 0; i < inputRows.size(); i++) {
                            String[] inputCells = inputRows.get(i);

                            // read current values and compare with entries
                            String semanticFeature = semanticFeatures[i % semanticFeatures.length];

                            if ("".equals(values.get(i).get("semanticFeature"))) {
                                values.get(i).put("semanticFeature", semanticFeature);
                            } else if (!values.get(i).get("semanticFeature").equals(semanticFeature)) {
                                System.err.println("Semantic Features do not match.");
                                System.exit(1);
                            }

                            // needs rework as we do aggregation here
                            // String classifierParameters = inputCells[0];
                            //
                            // if (values.get(i).get("classifierParameters") ==
                            // "")
                            // {
                            // values.get(i).put("classifierParameters",
                            // classifierParameters);
                            // }
                            // else
                            // {
                            // if
                            // (values.get(i).get("classifierParameters").equals(classifierParameters)
                            // == false)
                            // {
                            // System.err.println("Classifier parameters do not match.");
                            // System.exit(1);
                            // }
                            // }

                            String nGrams = inputCells[12];

                            if ("".equals(values.get(i).get("nGrams"))) {
                                values.get(i).put("nGrams", nGrams);
                            } else if (!values.get(i).get("nGrams").equals(nGrams)) {
                                System.err.println("N Gram Length does not match.");
                                System.exit(1);
                            }

                            // get and aggregate values; the first measure (Percent Correct) is given in percent
                            for (int j = 0; j < measures.length; j++) {
                                double valueInFile = Double.parseDouble(inputCells[j + 16]);
                                if (j == 0) {
                                    valueInFile /= 100;
                                }
                                ((double[]) values.get(i).get("aggregatedMeasureValues"))[j] += valueInFile;
                            }
                        }
                    }
                }

                // write aggregated results to file
                for (HashMap<String, Object> currentValues : values) {
                    String semFeature = (String) currentValues.get("semanticFeature");
                    String nGrams = (String) currentValues.get("nGrams");
                    String featureSet = String.format("%s, nGrams: %s", semFeature, nGrams);

                    for (int j = 0; j < measures.length; j++) {
                        String outputRow = String.format("%s;%s;%s;%s;%s;%f", trainSetName, testSetName, "0",
                                featureSet, measures[j],
                                ((double[]) currentValues.get("aggregatedMeasureValues"))[j]
                                        / nrDifferentClassifiers);
                        outputRows.add(outputRow);
                    }
                }

                // avoids a ConcurrentModificationException
                it.remove();
            }

            // Write aggregated data to a new file
            try {
                PrintWriter out = new PrintWriter(new FileWriter(outputFile, true));
                for (String s : outputRows) {
                    out.println(s);
                }
                out.close();
            } catch (IOException e) {
                System.err.println("Error while writing aggregated Train-Test file.");
                e.printStackTrace();
            }
        }

        logger.log(Level.INFO,
                String.format("Finished import. The aggregated data was written to %s.", outFileName));

    }

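    /**
     * Interpret the rows of an aggregated results .csv file and turn them into a
     * {@link SampleData} object for the statistical evaluation. Models are identified by
     * their classifier/feature-set pair, samples are grouped per measure, and the data is
     * validated against the requirements of the given pipeline type (e.g. a single dataset
     * for cross-validation). If a single baseline model is marked, it is moved to the front.
     *
     * @param config the statistics configuration (used to truncate the data to the best n models)
     * @param rows the rows of the aggregated .csv file, each split into its cells
     * @param pipelineType the type of evaluation pipeline the data stems from
     * @param pipelineMetadata additional metadata such as the number of folds and repetitions
     * @return the assembled SampleData, or null if the data is empty or does not match the pipeline type
     */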
    public static SampleData interpretCSV(StatsConfig config, List<String[]> rows, ReportTypes pipelineType,
            HashMap<String, Integer> pipelineMetadata) {

        HashMap<Integer, ArrayList<ArrayList<Double>>> samplesPerMeasure = new HashMap<Integer, ArrayList<ArrayList<Double>>>();

        //Only remove first line if it is a header line
        if (rows.size() > 0 && rows.get(0)[6].equals("IsBaseline")) {
            rows.remove(0);
        }

        if (rows.size() > 1) {

            logger.log(Level.INFO, "Extracting samples and metadata from imported data.");
            int selectBestN = config.getSelectBestN();
            String selectByMeasure = config.getSelectByMeasure();

            // Preprocessing: Parse different models (classifier + feature set column) and measures
            ArrayList<String> measures = new ArrayList<String>();
            ArrayList<Pair<String, String>> datasets = new ArrayList<Pair<String, String>>();
            ArrayList<Pair<String, String>> models = new ArrayList<Pair<String, String>>();
            ArrayList<Pair<String, String>> baselineModels = new ArrayList<Pair<String, String>>();

            for (int i = 0; i < rows.size(); i++) {
                String[] columns = rows.get(i);
                String classifier = columns[2];
                if (classifier.equals("0")) {
                    classifier = "Aggregated";
                }
                String featureSets = columns[3];
                Pair<String, String> model = Pair.of(classifier, featureSets);
                if (!models.contains(model)) {
                    models.add(model);
                    if (!baselineModels.contains(model) && Integer.parseInt(columns[6]) == 1) {
                        baselineModels.add(model);
                    }
                }
                if (!measures.contains(columns[4])) {
                    measures.add(columns[4]);
                }
            }

            //Check: a baseline is only meaningful when at least three models are evaluated
            if (models.size() <= 2 && baselineModels.size() > 0) {
                logger.log(Level.WARN,
                        "At least three models are required to make an evaluation against a baseline meaningful. In the dataset, a baseline was specified although only two or fewer models are present. The baseline indicator will be ignored.");
                System.err.println(
                        "At least three models are required to make an evaluation against a baseline meaningful. In the dataset, a baseline was specified although only two or fewer models are present. The baseline indicator will be ignored.");
                baselineModels.clear();
            }

            // Now sort samples according to data
            Collections.sort(rows, new Helpers.LexicographicArrayComparator());
            for (int i = 0; i < rows.size(); i++) {
                String[] columns = rows.get(i);
                Pair<String, String> data = null;
                String trainData = columns[0].trim();
                String testData = columns[1].trim();

                //If this is a CV, numbers after a dot indicate fold UUIDs; they thus have to be split off to retain the original dataset name
                if (pipelineType == ReportTypes.CV) {
                    trainData = trainData.split("\\.")[0];
                    testData = testData.split("\\.")[0];
                }

                if (trainData.equals(testData)) {
                    data = Pair.of(trainData, null);
                } else {
                    //columns[1] = columns[1].split(".")[0];
                    data = Pair.of(trainData, testData);
                }
                if (!datasets.contains(data)) {
                    datasets.add(data);
                }
            }

            // Preprocessing: Initialize sample container per measure/model
            for (int i = 0; i < measures.size(); i++) {
                ArrayList<ArrayList<Double>> samplesPerModel = new ArrayList<ArrayList<Double>>();
                for (int j = 0; j < models.size(); j++) {
                    samplesPerModel.add(new ArrayList<Double>());
                }
                samplesPerMeasure.put(i, samplesPerModel);
            }

            // Assign samples to different models
            for (int i = 0; i < rows.size(); i++) {
                String[] columns = rows.get(i);
                String classifier = columns[2];
                if (classifier.equals("0")) {
                    classifier = "Aggregated";
                }
                String featureSet = columns[3];
                String measure = columns[4];
                double value = Double.parseDouble(columns[5]);

                int measureIndex = measures.indexOf(measure);
                int modelIndex = models.indexOf(Pair.of(classifier, featureSet));

                ArrayList<ArrayList<Double>> sPMeasure = samplesPerMeasure.get(measureIndex);
                sPMeasure.get(modelIndex).add(value);
            }

            // Transform into data format required by the statistical evaluation
            HashMap<String, ArrayList<ArrayList<Double>>> indexedSamples = new HashMap<String, ArrayList<ArrayList<Double>>>();
            HashMap<String, ArrayList<Double>> indexedSamplesAverage = new HashMap<String, ArrayList<Double>>();

            Iterator<Integer> it = samplesPerMeasure.keySet().iterator();
            while (it.hasNext()) {
                int measureIndex = it.next();
                ArrayList<ArrayList<Double>> samplesPerModel = samplesPerMeasure.get(measureIndex);

                ArrayList<Double> sampleAverages = new ArrayList<Double>(models.size());
                for (int modelIndex = 0; modelIndex < models.size(); modelIndex++) {
                    ArrayList<Double> sample = samplesPerModel.get(modelIndex);
                    double average = 0;
                    for (int j = 0; j < sample.size(); j++) {
                        average += sample.get(j);
                    }
                    average /= sample.size();
                    sampleAverages.add(average);
                }
                indexedSamplesAverage.put(measures.get(measureIndex), sampleAverages);
                indexedSamples.put(measures.get(measureIndex), samplesPerMeasure.get(measureIndex));
            }

            // Check if data fulfills general requirements: at least 5 samples for each model, same number of samples per model
            it = samplesPerMeasure.keySet().iterator();
            while (it.hasNext()) {
                Integer measureIndex = it.next();
                ArrayList<ArrayList<Double>> samplesPerModel = samplesPerMeasure.get(measureIndex);
                int s = samplesPerModel.get(0).size();

                for (int i = 0; i < samplesPerModel.size(); i++) {
                    if (samplesPerModel.get(i).size() < 5) {
                        logger.log(Level.ERROR, "At least 5 samples are needed per model and measure. Aborting.");
                        System.err.println("At least 5 samples are needed per model and measure. Aborting.");
                        System.exit(1);
                    }
                    if (samplesPerModel.get(i).size() != s) {
                        logger.log(Level.ERROR,
                                "Different models are not represented by the same number of samples. Aborting.");
                        System.err.println(
                                "Different models are not represented by the same number of samples. Aborting.");
                        System.exit(1);
                    }
                }
            }

            // Collect remaining data required for creating a SampleData object
            // Check if data fulfills requirements of the specific PipelineTypes
            int nFolds = 1;
            int nRepetitions = 1;
            switch (pipelineType) {
            case CV:
                if (datasets.size() > 1) {
                    System.err.println(
                            "Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
                    logger.log(Level.ERROR,
                            "Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
                    return null;
                } else if (datasets.get(0).getValue() != null) {
                    System.err.println(
                            "Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
                    logger.log(Level.ERROR,
                            "Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
                    return null;
                }
                nFolds = indexedSamples.get(measures.get(0)).get(0).size();
                nRepetitions = 1;
                break;
            case MULTIPLE_CV:
                if (datasets.size() > 1) {
                    System.err.println(
                            "Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
                    logger.log(Level.ERROR,
                            "Input data corrupted. More than one dataset specified for Single-Domain Cross-Validation.");
                    return null;
                } else if (datasets.get(0).getValue() != null) {
                    System.err.println(
                            "Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
                    logger.log(Level.ERROR,
                            "Input data corrupted. Training and Test dataset must be same for Cross-Validation.");
                    return null;
                }
                nFolds = pipelineMetadata.get("nFolds");
                nRepetitions = indexedSamples.get(measures.get(0)).get(0).size();
                break;
            case CV_DATASET_LVL:
                nFolds = pipelineMetadata.get("nFolds");
                nRepetitions = 1;
                break;
            case MULTIPLE_CV_DATASET_LVL:
                nFolds = pipelineMetadata.get("nFolds");
                nRepetitions = pipelineMetadata.get("nRepetitions");
                break;
            case TRAIN_TEST_DATASET_LVL:
                nFolds = 1;
                nRepetitions = 1;
                break;
            default:
                System.err.println("Unknown PipelineType. Aborting.");
                logger.log(Level.ERROR, "Unknown PipelineType. Aborting.");
                return null;
            }

            //Reorder data in case of a baseline evaluation (baseline first)
            if (baselineModels.size() == 1) {
                Pair<String, String> baselineModel = baselineModels.get(0);
                int modelIndex = models.indexOf(baselineModel);
                models.remove(modelIndex);
                models.add(0, baselineModel);
                for (String measure : indexedSamples.keySet()) {
                    ArrayList<Double> s = indexedSamples.get(measure).get(modelIndex);
                    indexedSamples.get(measure).remove(modelIndex);
                    indexedSamples.get(measure).add(0, s);
                    double a = indexedSamplesAverage.get(measure).get(modelIndex);
                    indexedSamplesAverage.get(measure).remove(modelIndex);
                    indexedSamplesAverage.get(measure).add(0, a);
                }
            }

            SampleData sampleData = new SampleData(null, indexedSamples, indexedSamplesAverage, datasets, models,
                    baselineModels, pipelineType, nFolds, nRepetitions);
            sampleData = Helpers.truncateData(sampleData, selectBestN, selectByMeasure);

            return sampleData;
        }
        return null;
    }

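    /**
     * Split the sample data into one {@link SampleData} object per value of the fixed
     * independent variable (classifier or feature set, as configured). If both feature sets
     * and classifiers vary, the data is separated so that each resulting object only varies
     * in the non-fixed variable; otherwise the data is returned unchanged. Baseline models
     * are moved to the front of each split.
     *
     * @param data the sample data to split
     * @param config the statistics configuration specifying which independent variable to fix
     * @return the list of split sample data objects
     */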
    public static List<SampleData> splitData(SampleData data, StatsConfig config) {

        List<SampleData> splitted = new ArrayList<SampleData>();

        //Use lists instead of sets to maintain order of model metadata
        ArrayList<String> featureSets = new ArrayList<String>();
        ArrayList<String> classifiers = new ArrayList<String>();
        for (Pair<String, String> metadata : data.getModelMetadata()) {
            if (!classifiers.contains(metadata.getLeft())) {
                classifiers.add(metadata.getLeft());
            }
            if (!featureSets.contains(metadata.getRight())) {
                featureSets.add(metadata.getRight());
            }
        }

        //Only separate data if there's more than one independent variable
        if (!(featureSets.size() > 1 && classifiers.size() > 1)) {
            splitted.add(data);
            return splitted;
        }

        List<String> it = (config
                .getFixIndependentVariable() == StatsConfigConstants.INDEPENDENT_VARIABLES_VALUES.Classifier)
                        ? classifiers
                        : featureSets;
        for (String fixed : it) {
            ArrayList<Pair<String, String>> modelMetadata = new ArrayList<Pair<String, String>>();
            HashMap<String, ArrayList<ArrayList<Double>>> samples = new HashMap<String, ArrayList<ArrayList<Double>>>();
            HashMap<String, ArrayList<Double>> sampleAverages = new HashMap<String, ArrayList<Double>>();
            for (int i = 0; i < data.getModelMetadata().size(); i++) {
                Pair<String, String> model = data.getModelMetadata().get(i);
                boolean eq = (config
                        .getFixIndependentVariable() == StatsConfigConstants.INDEPENDENT_VARIABLES_VALUES.Classifier)
                                ? model.getLeft().equals(fixed)
                                : model.getRight().equals(fixed);
                if (eq) {
                    modelMetadata.add(model);
                    for (String measure : data.getSamples().keySet()) {
                        if (!samples.containsKey(measure)) {
                            samples.put(measure, new ArrayList<ArrayList<Double>>());
                            sampleAverages.put(measure, new ArrayList<Double>());
                        }
                        samples.get(measure).add(data.getSamples().get(measure).get(i));
                        sampleAverages.get(measure).add(data.getSamplesAverage().get(measure).get(i));
                    }
                }
            }
            ArrayList<Pair<String, String>> baselineModelData = new ArrayList<Pair<String, String>>();
            if (data.isBaselineEvaluation()) {
                Pair<String, String> baselineModel = null;
                for (int i = 0; i < data.getBaselineModelMetadata().size(); i++) {
                    boolean eq = (config
                            .getFixIndependentVariable() == StatsConfigConstants.INDEPENDENT_VARIABLES_VALUES.Classifier)
                                    ? data.getBaselineModelMetadata().get(i).getLeft().equals(fixed)
                                    : data.getBaselineModelMetadata().get(i).getRight().equals(fixed);
                    if (eq) {
                        baselineModel = data.getBaselineModelMetadata().get(i);
                        break;
                    }
                }
                if (baselineModel != null) {
                    baselineModelData.add(baselineModel);
                    int modelIndex = modelMetadata.indexOf(baselineModel);
                    modelMetadata.remove(modelIndex);
                    modelMetadata.add(0, baselineModel);
                    for (String measure : data.getSamples().keySet()) {
                        ArrayList<Double> s = samples.get(measure).get(modelIndex);
                        samples.get(measure).remove(modelIndex);
                        samples.get(measure).add(0, s);
                        double a = sampleAverages.get(measure).get(modelIndex);
                        sampleAverages.get(measure).remove(modelIndex);
                        sampleAverages.get(measure).add(0, a);
                    }
                } else {
                    logger.log(Level.ERROR,
                            "Missing baseline model! Please check if baseline indicators are set correctly in the input file, and if they correspond correctly to the fixIndependentVariable property in the configuration. In case of both varying feature sets and classifiers, baseline indicators have to be set multiple times.");
                    System.err.println(
                            "Missing baseline model! Please check if baseline indicators are set correctly in the input file, and if they correspond correctly to the fixIndependentVariable property in the configuration. In case of both varying feature sets and classifiers, baseline indicators have to be set multiple times.");
                    System.exit(1);
                }
            }
            SampleData newData = new SampleData(null, samples, sampleAverages, data.getDatasetNames(),
                    modelMetadata, baselineModelData, data.getPipelineType(), data.getnFolds(),
                    data.getnRepetitions());
            splitted.add(newData);
        }
        return splitted;
    }

    /**
     * Read a csv file, split each line by the specified separator and check
     * whether each line can be split into the same number of columns
     * 
     * @param pathToCsvFile the path to the .csv file
     * @param separator the separator used to split a line into separate cells, each relating to one column
     * @return a {@code List<String[]>} containing all lines split into tokens
     */
    public static List<String[]> readAndCheckCSV(String pathToCsvFile, char separator) {
        List<String[]> rows = new ArrayList<String[]>();
        try {
            CSVReader reader = new CSVReader(new FileReader(pathToCsvFile), separator);
            rows = reader.readAll();
            reader.close();

            if (rows.size() > 0) {
                for (String[] row : rows) {
                    if (row.length != rows.get(0).length) {
                        logger.log(Level.ERROR, ".csv file corrupt: number of columns not same for each row.");
                        System.err.println(".csv file corrupt: number of columns not same for each row.");
                        System.exit(1);
                    }
                    // the check expects the aggregated format with exactly 7 columns
                    // (Train;Test;Classifier;FeatureSet;Measure;Value;IsBaseline)
                    if (row.length != 7) {
                        logger.log(Level.ERROR, ".csv file corrupt: must contain exactly 7 columns.");
                        System.err.println(".csv file corrupt: must contain exactly 7 columns.");
                        System.exit(1);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            logger.log(Level.ERROR, "Input .csv file not found!");
            System.err.println("Input .csv file not found!");
            System.exit(1);
        } catch (IOException e) {
            logger.log(Level.ERROR, "Exception while reading input data .csv!");
            System.err.println("Exception while reading input data .csv!");
            e.printStackTrace();
            System.exit(1);
        }
        return rows;
    }

}
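
For orientation, here is a minimal sketch of how the reader might be driven. It is illustrative only: the sketch class name, the input path, the chosen pipeline type and the fold/repetition numbers are assumptions, and the StatsConfig instance is expected to be created elsewhere (its construction is not part of this source).

import java.util.HashMap;
import java.util.List;

import de.tudarmstadt.tk.statistics.config.ReportTypes;
import de.tudarmstadt.tk.statistics.config.StatsConfig;
import de.tudarmstadt.tk.statistics.importer.ExternalResultsReader;
import de.tudarmstadt.tk.statistics.test.SampleData;

public class ExternalResultsReaderSketch {

    // Hypothetical driver; "config" is an already configured StatsConfig instance.
    public static void importAndSplit(StatsConfig config) {
        // Read the aggregated 7-column file (Train;Test;Classifier;FeatureSet;Measure;Value;IsBaseline)
        List<String[]> rows = ExternalResultsReader.readAndCheckCSV("AggregatedCVRandom.csv", ';');

        // Metadata for a repeated cross-validation on dataset level (values are illustrative)
        HashMap<String, Integer> metadata = new HashMap<>();
        metadata.put("nFolds", 10);
        metadata.put("nRepetitions", 5);

        SampleData data = ExternalResultsReader.interpretCSV(config, rows,
                ReportTypes.MULTIPLE_CV_DATASET_LVL, metadata);
        if (data == null) {
            return; // the input did not meet the requirements checked by interpretCSV
        }

        // One SampleData per value of the fixed independent variable (classifier or feature set)
        List<SampleData> perVariable = ExternalResultsReader.splitData(data, config);
        System.out.println("Obtained " + perVariable.size() + " sample set(s).");
    }
}

Note that interpretCSV returns null rather than throwing when the input does not match the pipeline type, so callers should check the result before continuing.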