javalibs.CSVDataNormalizer.java Source code

Introduction

Here is the source code for javalibs.CSVDataNormalizer.java
Source

package javalibs;
/**
 * Copyright (c) 2018 Sean Grimes. All rights reserved.
 * @author Sean Grimes, spg63@drexel.edu
 * @since 12/20/18
 * License: MIT License
 */

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;

/**
 * Reads a CSV file, computes the min and max of selected numeric columns and writes a copy of
 * the file in which those columns are replaced by their normalized value, clamped to the
 * range [0, 1]. All other columns are written back unchanged.
 *
 * The whole file is read and the min/max values are computed in the constructor; the output
 * file is only written when {@link #normalize()} is called.
 */
@SuppressWarnings({ "unchecked", "SpellCheckingInspection", "unused", "UnusedAssignment", "ConstantConditions" })
public class CSVDataNormalizer {
    private String csvPath;
    private List<String> columnsToNormalize;
    private Map<String, List<String>> columnsWithLinkings;
    private boolean linkingsExist = false;
    private List<CSVRecord> allRecords = new ArrayList<>();
    private Map<String, Integer> headerMap = new HashMap<>();
    private TSL log_ = TSL.get();
    // Column name -> Pair(left = max, right = min), see getMaxMinFromCol
    private Map<String, Pair<Double, Double>> colsToMaxMinPairs = new HashMap<>();
    private Map<Integer, String> colNumToName = new HashMap<>();
    private int numCols = 0;
    private String[] headersInOrder;
    private String savePath;

    /**
     * Normalizes each listed column independently and saves the result over the input file.
     *
     * @param pathToCSV The path to the CSV file
     * @param columns A list of column names that need to be normalized. This assumes that no
     *                columns are dependent. I.e. min and max values for each item in a
     *                column will come from the column they're already in
     */
    public CSVDataNormalizer(String pathToCSV, List<String> columns) {
        this.csvPath = pathToCSV;
        this.columnsToNormalize = columns;
        readCSV();
        getAllMinMaxValues();
        // Default: the normalized data overwrites the input file
        this.savePath = pathToCSV;
    }

    /**
     * Normalizes each listed column independently and saves the result to a separate file.
     *
     * @param existingCSV The path to the CSV file
     * @param savePath Path to save the normalized CSV
     * @param columns A list of column names that need to be normalized. This assumes that no
     *                columns are dependent. I.e. min and max values for each item in a column
     *                will come from the column they're already in
     */
    public CSVDataNormalizer(String existingCSV, String savePath, List<String> columns) {
        this(existingCSV, columns);
        this.savePath = savePath;
    }

    /**
     * Normalizes columns whose min and max values are shared with other ("linked") columns and
     * saves the result over the input file.
     *
     * Example: a CSV file has budget and revenue data for movies. It makes sense that the max
     * monetary value comes from either the budget or revenue column, same with the min value
     * -- otherwise, if there exists no budget that is greater than revenue, a revenue value with
     * a higher non-normalized value may have a smaller normalized value.
     *
     * Only the keys of the map are normalized. A linked column contributes to the min/max of
     * its key but is itself normalized only if it also appears as a key.
     *
     * @param pathToCSV The path to the CSV file
     * @param columnsWithLinkings A map of column names that map to a list of columns that
     *                            are linked together when determining max and min values for
     *                            normalization. The map and its lists are not modified.
     */
    public CSVDataNormalizer(String pathToCSV, Map<String, List<String>> columnsWithLinkings) {
        this.csvPath = pathToCSV;
        this.columnsWithLinkings = columnsWithLinkings;
        this.columnsToNormalize = new ArrayList<>();
        this.linkingsExist = true;
        readCSV();
        getAllMinMaxValues();
        // Default: the normalized data overwrites the input file
        this.savePath = pathToCSV;
    }

    /**
     * Same as {@link #CSVDataNormalizer(String, Map)} but saves the result to a separate file.
     *
     * @param existingCSV Path to the existing CSV file
     * @param savePath Path to save the normalized CSV
     * @param columnsWithLinkings Map of column names that map to a list of columns
     *                            that are linked together when determining max and min values
     *                            for normalization
     */
    public CSVDataNormalizer(String existingCSV, String savePath, Map<String, List<String>> columnsWithLinkings) {
        this(existingCSV, columnsWithLinkings);
        this.savePath = savePath;
    }

    /**
     * Writes all records to the save path, with the original header row, replacing every cell
     * of a normalized column by its normalized value. Values that fall outside [0, 1] after
     * normalization are logged and clamped. The writer is always closed, even on failure; any
     * IOException is handed to the logger's die().
     */
    public void normalize() {
        // Built once so the per-cell membership test is a hash lookup instead of a list scan.
        // NOTE(review): the CSV is parsed with withIgnoreHeaderCase(), but this membership test
        // is case-sensitive (as it was before) -- confirm callers pass the exact header case.
        Set<String> normalizedColumns = new HashSet<>(this.columnsToNormalize);

        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(this.savePath));
             CSVPrinter printer = new CSVPrinter(bw, CSVFormat.DEFAULT.withHeader(this.headersInOrder))) {
            for (CSVRecord rec : this.allRecords) {
                List<String> writerCells = buildOutputRow(rec, normalizedColumns);
                printer.printRecord(writerCells.toArray());
            }
            printer.flush();
        } catch (IOException e) {
            log_.die(e);
        }
    }

    /** Builds the output cells for one record, normalizing the cells of the selected columns. */
    private List<String> buildOutputRow(CSVRecord rec, Set<String> normalizedColumns) {
        List<String> writerCells = new ArrayList<>(this.numCols);
        for (int i = 0; i < this.numCols; ++i) {
            String colName = this.colNumToName.get(i);
            if (!normalizedColumns.contains(colName)) {
                // Not a normalized column, copy the cell through untouched
                writerCells.add(rec.get(i));
                continue;
            }

            double curVal = NumUtils.getDoubleFromStr(rec.get(colName));
            Pair<Double, Double> maxMin = this.colsToMaxMinPairs.get(colName);
            // right() is the min and left() is the max, see getMaxMinFromCol
            double normal = NumUtils.normalizeBetweenZeroOne(maxMin.right(), maxMin.left(), curVal);
            if (normal > 1.0) {
                log_.warn("Normalized value greater than 1.0: " + normal + " from curVal: " + curVal
                        + " setting normal to 1.");
                normal = 1.0;
            } else if (normal < 0.0) {
                log_.warn("Normalized value less than 0.0: " + normal + " from curVal : " + curVal
                        + " setting normal to 0.");
                normal = 0.0;
            }

            writerCells.add(Double.toString(normal));
        }
        return writerCells;
    }

    /**
     * Fills colsToMaxMinPairs for every column to normalize. With linkings, the keys of the
     * linking map become the columns to normalize and their min/max spans all linked columns.
     */
    private void getAllMinMaxValues() {
        if (this.linkingsExist) {
            // Go through all of the columns that need to be normalized
            for (String column : this.columnsWithLinkings.keySet()) {
                this.columnsToNormalize.add(column);
                this.colsToMaxMinPairs.put(column, getMaxMinFromLinkedColumns(column));
            }
        } else {
            for (String col : this.columnsToNormalize) {
                this.colsToMaxMinPairs.put(col, getMaxMinFromCol(col));
            }
        }
    }

    /**
     * Scans every record for the largest and smallest value of a single column.
     *
     * @param columnName Header name of the column to scan
     * @return Pair with left = max and right = min. If there are no records the pair holds the
     *         seed values (-Double.MAX_VALUE, Double.MAX_VALUE).
     */
    private Pair<Double, Double> getMaxMinFromCol(String columnName) {
        // Double.MIN_VALUE is the smallest POSITIVE double, so it cannot be used as the seed for
        // a running max: a column holding only zero or negative values would get a wrong max.
        double max = -Double.MAX_VALUE;
        double min = Double.MAX_VALUE;
        for (CSVRecord record : this.allRecords) {
            double val = NumUtils.getDoubleFromStr(record.get(columnName));
            // NOTE: Floating point errors aren't really that important here, don't waste time on
            // a proper floating point comparison. Plain comparisons (rather than Math.max/min)
            // also mean a NaN value is skipped instead of poisoning the result.
            if (val > max) {
                max = val;
            }
            if (val < min) {
                min = val;
            }
        }

        return new Pair<>(max, min);
    }

    /**
     * Finds the largest and smallest value across a column and all columns linked to it.
     *
     * @param columnName Key in the linking map, i.e. the column being normalized
     * @return Pair with left = max and right = min over the column and its linked columns
     */
    private Pair<Double, Double> getMaxMinFromLinkedColumns(String columnName) {
        List<String> linked = this.columnsWithLinkings.get(columnName);
        // Work on a copy: the list belongs to the caller's map and may be immutable or reused.
        // A null entry is treated as "no linked columns".
        List<String> cols = (linked == null) ? new ArrayList<>() : new ArrayList<>(linked);
        // Also need to look at the primary column name
        cols.add(columnName);
        double max = -Double.MAX_VALUE;
        double min = Double.MAX_VALUE;

        for (String col : cols) {
            Pair<Double, Double> maxMin = getMaxMinFromCol(col);
            if (maxMin.left() > max) {
                max = maxMin.left();
            }
            if (maxMin.right() < min) {
                min = maxMin.right();
            }
        }

        return new Pair<>(max, min);
    }

    /**
     * Loads the header map and every record of the input CSV into memory, then builds the
     * column-index lookups. The parser is closed even if reading fails; any IOException is
     * handed to the logger's die().
     */
    private void readCSV() {
        try (CSVParser parser = new CSVParser(Files.newBufferedReader(Paths.get(this.csvPath)),
                CSVFormat.DEFAULT.withHeader().withIgnoreHeaderCase().withTrim())) {

            // Get all headers in the CSV file so they can be used later when writing the file
            this.headerMap = parser.getHeaderMap();

            // Add them to the records list for later use
            this.allRecords = parser.getRecords();

            reverseHeaderMap();
        } catch (IOException e) {
            log_.die(e);
        }
    }

    // NOTE: This works because I know there are no repeat values in the hashmap. This is not a
    // generalizable solution.
    /** Builds the index -> name lookup and the header array ordered by column index. */
    private void reverseHeaderMap() {
        for (Map.Entry<String, Integer> header : this.headerMap.entrySet()) {
            this.colNumToName.put(header.getValue(), header.getKey());
            ++this.numCols;
        }

        this.headersInOrder = new String[this.numCols];

        for (int i = 0; i < this.numCols; ++i) {
            this.headersInOrder[i] = this.colNumToName.get(i);
        }
    }
}