rgu.jclos.foldbuilder.FoldBuilder.java Source code

Java tutorial

Introduction

Here is the source code for rgu.jclos.foldbuilder.FoldBuilder.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package rgu.jclos.foldbuilder;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.tuple.Pair;

/**
 * Splits a delimited text file into K stratified folds for cross-validation.
 *
 * <p>Every line of the input file is treated as one instance, and one of its columns holds the
 * class label. Instances are distributed over the folds class by class so that each fold keeps
 * roughly the class proportions of the whole file. For every fold a training file (all other
 * folds) and a testing file (the fold itself) can be written to disk.
 *
 * <p>The separator is passed to {@link String#split(String)} and is therefore interpreted as a
 * regular expression.
 *
 * @author Jeremie
 */
public class FoldBuilder {

    /** One line of the input file together with the class label extracted from it. */
    public static class Instance {
        /** The complete, unmodified input line. */
        public String content;
        /** The value found in the label column of that line. */
        public String label;
    }

    /**
     * Command-line entry point. Parses the options, then computes the folds and writes them.
     *
     * @param args The command-line arguments; {@code -i/--input} is mandatory, every other
     *             option has a default.
     * @throws ParseException If the command line cannot be parsed.
     * @throws IOException If something stops the program from reading or writing the files.
     * @throws IllegalArgumentException If no input file is given or an option value is invalid.
     */
    public static void main(String[] args) throws ParseException, IOException {
        Options options = new Options();
        options.addOption("i", "input", true, "Input file (mandatory)");
        options.addOption("k", true, "Number of folds (defaults to 2)");
        options.addOption("o", "output", true, "Output directory (defaults to current directory)");
        options.addOption("s", "separator", true, "Separating character in the CSV file (defaults to \\t)");
        options.addOption("idx", "index", true, "Index of the label (index or first or last) (defaults to last)");
        options.addOption("v", "verbose", false, "Print messages during fold generation (defaults to false)");
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(options, args);

        if (!cmd.hasOption("input")) {
            throw new IllegalArgumentException("You need to provide an input file.");
        }
        String inputFile = cmd.getOptionValue("input");

        // Decimal parse with the documented default of 2 folds when -k is absent.
        int k = Integer.parseInt(cmd.getOptionValue("k", "2"));
        String outputDirectory = cmd.getOptionValue("output", ".");
        String separator = cmd.getOptionValue("separator", "\t");
        String indexLabel = cmd.getOptionValue("index", "last");
        boolean speak = cmd.hasOption("verbose");

        computeAndWriteFolds(inputFile, outputDirectory, separator, indexLabel, k, speak);
    }

    /**
     * Alternative entry point taking purely positional arguments.
     *
     * @param args Exactly, in this order: input file, output directory, separator, numeric label
     *             index, number of folds, verbose flag ({@code true}/{@code false}).
     * @throws IOException If something stops the program from reading or writing the files.
     * @throws IllegalArgumentException If fewer than six arguments are given or a number is invalid.
     */
    public static void main2(String[] args) throws IOException {
        if (args.length < 6) {
            throw new IllegalArgumentException(
                    "Expected 6 arguments: inputFile outputDirectory separator indexLabel k speak");
        }
        String inputFile = args[0];
        String outputDirectory = args[1];
        String separator = args[2];
        int indexLabel = Integer.parseInt(args[3]);
        int k = Integer.parseInt(args[4]);
        boolean speak = Boolean.parseBoolean(args[5]);
        computeAndWriteFolds(inputFile, outputDirectory, separator, indexLabel, k, speak);
    }

    /**
     * Generates K folds and writes them to disk.
     *
     * @param inputFile The CSV file from which the data comes from.
     * @param outputDirectory The directory in which the folds will be written.
     * @param separator The separator of the CSV file, interpreted as a regular expression.
     * @param indexLabel The zero-based index of the label column. Used for stratification of the folds.
     * @param k The number of folds to generate; must be at least 1.
     * @param speak Whether to print some status messages along the way.
     * @throws IOException If something stops the program from reading or writing the files.
     * @throws IllegalArgumentException If {@code k} is below 1 or a line has no column at {@code indexLabel}.
     */
    public static void computeAndWriteFolds(String inputFile, String outputDirectory, String separator,
            int indexLabel, int k, boolean speak) throws IOException {
        Pair<List<Set<String>>, Map<String, Instance>> tmp = getFolds(inputFile, outputDirectory, separator,
                indexLabel, k, speak);
        writeFolds(tmp.getLeft(), tmp.getRight(), outputDirectory, speak);
    }

    /**
     * Generates K folds and writes them to disk, with the label column given symbolically.
     *
     * @param inputFile The CSV file from which the data comes from.
     * @param outputDirectory The directory in which the folds will be written.
     * @param separator The separator of the CSV file, interpreted as a regular expression.
     * @param indexLabel {@code "first"}, {@code "last"} or a zero-based column index as text.
     * @param k The number of folds to generate; must be at least 1.
     * @param speak Whether to print some status messages along the way.
     * @throws IOException If something stops the program from reading or writing the files.
     * @throws IllegalArgumentException If {@code k} or {@code indexLabel} is invalid.
     */
    private static void computeAndWriteFolds(String inputFile, String outputDirectory, String separator,
            String indexLabel, int k, boolean speak) throws IOException {
        Pair<List<Set<String>>, Map<String, Instance>> tmp = getFolds(inputFile, outputDirectory, separator,
                indexLabel, k, speak);
        writeFolds(tmp.getLeft(), tmp.getRight(), outputDirectory, speak);
    }

    /**
     * Reads the input file and generates K stratified folds in memory. Nothing is written to disk.
     *
     * @param inputFile The CSV file from which the data comes from.
     * @param outputDirectory Unused by this method; kept for signature compatibility.
     * @param separator The separator of the CSV file, interpreted as a regular expression.
     * @param indexLabel The zero-based index of the label column. Used for stratification of the folds.
     * @param k The number of folds to generate; must be at least 1.
     * @param speak Whether to print some status messages along the way.
     * @return A pair containing a list of folds with ids of documents, and a dictionary that allows the
     *         user to retrieve the aforementioned documents using the ids, in order to save space.
     * @throws IOException If something stops the program from reading the file.
     * @throws IllegalArgumentException If {@code k} is below 1 or a line has no column at {@code indexLabel}.
     */
    public static Pair<List<Set<String>>, Map<String, Instance>> getFolds(String inputFile, String outputDirectory,
            String separator, int indexLabel, int k, boolean speak) throws IOException {
        List<String> lines = Files.readAllLines(new File(inputFile).toPath());
        return buildFolds(lines, separator, indexLabel, k, speak);
    }

    /**
     * Reads the input file and generates K stratified folds in memory, with the label column given
     * symbolically. Nothing is written to disk.
     *
     * @param inputFile The CSV file from which the data comes from.
     * @param outputDirectory Unused by this method; kept for signature compatibility.
     * @param separator The separator of the CSV file, interpreted as a regular expression.
     * @param indexLabel {@code "first"}, {@code "last"} (resolved against the first line of the file)
     *                   or a zero-based column index as text.
     * @param k The number of folds to generate; must be at least 1.
     * @param speak Whether to print some status messages along the way.
     * @return A pair containing a list of folds with ids of documents, and a dictionary that allows the
     *         user to retrieve the aforementioned documents using the ids, in order to save space.
     * @throws IOException If something stops the program from reading the file.
     * @throws IllegalArgumentException If {@code k} or {@code indexLabel} is invalid.
     */
    private static Pair<List<Set<String>>, Map<String, Instance>> getFolds(String inputFile, String outputDirectory,
            String separator, String indexLabel, int k, boolean speak) throws IOException {
        // The file is read once and shared between index resolution and fold construction.
        List<String> lines = Files.readAllLines(new File(inputFile).toPath());
        int labelIndex = resolveLabelIndex(indexLabel, lines, separator);
        return buildFolds(lines, separator, labelIndex, k, speak);
    }

    /**
     * Turns the symbolic label position into a zero-based column index.
     *
     * @param indexLabel {@code "first"}, {@code "last"} or a number as text.
     * @param lines The lines of the input file; only the first one is inspected, and only for {@code "last"}.
     * @param separator The separator of the CSV file, interpreted as a regular expression.
     * @return The zero-based index of the label column.
     * @throws NumberFormatException If {@code indexLabel} is neither a keyword nor a number.
     */
    private static int resolveLabelIndex(String indexLabel, List<String> lines, String separator) {
        if (indexLabel.equals("first")) {
            return 0;
        }
        if (indexLabel.equals("last")) {
            // With no lines there is nothing to index, so any value works; 0 avoids failing on an empty file.
            return lines.isEmpty() ? 0 : lines.get(0).split(separator).length - 1;
        }
        return Integer.parseInt(indexLabel);
    }

    /**
     * Builds the id dictionary and distributes the instance ids over K stratified folds.
     *
     * <p>Each class first contributes the same quota of randomly drawn instances to every fold. The
     * instances left over after that are then dealt to the folds one at a time, so that every
     * instance ends up in exactly one fold and fold sizes differ by at most one.
     *
     * @param lines The lines of the input file, one instance per line.
     * @param separator The separator of the CSV file, interpreted as a regular expression.
     * @param labelIndex The zero-based index of the label column.
     * @param k The number of folds to generate; must be at least 1.
     * @param speak Whether to print the size of each fold once it is complete.
     * @return A pair of the folds (as sets of instance ids) and the id-to-instance dictionary.
     * @throws IllegalArgumentException If {@code k} is below 1 or a line has no column at {@code labelIndex}.
     */
    private static Pair<List<Set<String>>, Map<String, Instance>> buildFolds(List<String> lines,
            String separator, int labelIndex, int k, boolean speak) {
        if (k < 1) {
            throw new IllegalArgumentException("The number of folds must be at least 1, got " + k);
        }

        Map<String, Instance> dictionary = new HashMap<>();
        Map<String, List<String>> idsByLabel = new HashMap<>();
        int id = 0;
        for (String line : lines) {
            String[] elements = line.split(separator);
            if (labelIndex < 0 || labelIndex >= elements.length) {
                throw new IllegalArgumentException("Line " + (id + 1) + " has " + elements.length
                        + " column(s), so there is no label at index " + labelIndex);
            }
            Instance inst = new Instance();
            inst.content = line;
            inst.label = elements[labelIndex];
            String iid = "inst" + id;
            dictionary.put(iid, inst);
            idsByLabel.computeIfAbsent(inst.label, unused -> new ArrayList<>()).add(iid);
            id++;
        }

        int numberOfInstances = id;
        int sizeOfEachFold = numberOfInstances / k;

        List<Set<String>> folds = new ArrayList<>();
        for (int i = 0; i < k; i++) {
            folds.add(new HashSet<>());
        }

        Random rng = new Random();
        // Shared across classes so the leftovers of all classes are spread evenly over the folds.
        int nextFold = 0;
        for (List<String> pool : idsByLabel.values()) {
            // Per-fold share of this class: floor(classCount / total * foldSize), in exact integer maths.
            int quota = (int) ((long) pool.size() * sizeOfEachFold / numberOfInstances);
            for (Set<String> fold : folds) {
                for (int n = 0; n < quota && !pool.isEmpty(); n++) {
                    fold.add(drawRandom(pool, rng));
                }
            }
            // Whatever the floored quotas did not cover must still land in some fold.
            while (!pool.isEmpty()) {
                folds.get(nextFold).add(drawRandom(pool, rng));
                nextFold = (nextFold + 1) % k;
            }
        }

        if (speak) {
            for (int i = 0; i < folds.size(); i++) {
                System.out.println("Finished computing fold " + (i + 1) + " of size " + folds.get(i).size());
            }
        }

        return Pair.of(folds, dictionary);
    }

    /**
     * Removes and returns a uniformly chosen element of a non-empty list.
     *
     * @param pool The list to draw from; it is shortened by one element.
     * @param rng The source of randomness.
     * @return The element that was removed.
     */
    private static String drawRandom(List<String> pool, Random rng) {
        return pool.remove(rng.nextInt(pool.size()));
    }

    /**
     * Writes one training file and one testing file per fold into the output directory.
     *
     * <p>For fold {@code i} the testing file contains the lines of fold {@code i} and the training
     * file contains the lines of all other folds. Lines are kept in lists so that identical rows of
     * the input are not collapsed into one.
     *
     * @param folds The folds, as sets of instance ids.
     * @param dictionary Maps every instance id to its instance.
     * @param outputDirectory The directory in which the files are written.
     * @param speak Whether to print a status message before writing.
     * @throws IOException If a file cannot be written.
     */
    private static void writeFolds(List<Set<String>> folds, Map<String, Instance> dictionary,
            String outputDirectory, boolean speak) throws IOException {
        if (speak) {
            System.out.println("Writing folds on disk");
        }
        for (int i = 0; i < folds.size(); i++) {
            List<String> trainingSet = new ArrayList<>();
            List<String> testSet = new ArrayList<>();
            for (int j = 0; j < folds.size(); j++) {
                // The held-out fold is the test data; every other fold is training data.
                List<String> target = (i == j) ? testSet : trainingSet;
                for (String tid : folds.get(j)) {
                    target.add(dictionary.get(tid).content);
                }
            }

            String filenameTraining = "Fold_" + (i + 1) + "_TrainingSet.csv";
            String filenameTesting = "Fold_" + (i + 1) + "_TestingSet.csv";
            File outputTraining = new File(outputDirectory + File.separator + filenameTraining);
            File outputTesting = new File(outputDirectory + File.separator + filenameTesting);
            Files.write(outputTraining.toPath(), trainingSet);
            Files.write(outputTesting.toPath(), testSet);
        }
    }
}