com.rcv.StreamingCVRReader.java Source code

Java tutorial

Introduction

Here is the source code for com.rcv.StreamingCVRReader.java

Source

/*
 * Ranked Choice Voting Universal Tabulator
 * Copyright (c) 2018 Jonathan Moldover, Louis Eisenberg, and Hylton Edingfield
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 * the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this
 * program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Purpose:
 * Helper class to read and parse an xlsx cast vote record file into cast vote record objects.
 */

package com.rcv;

import com.rcv.RawContestConfig.CVRSource;
import java.io.File;
import java.io.IOException;
import java.security.InvalidParameterException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import javafx.util.Pair;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

// purpose: streams the first sheet of an Excel (OOXML) cast vote record file through SAX parsing
// callbacks and converts every row at or beyond the first vote row into a CastVoteRecord.
// State for the row currently being parsed is kept in instance fields which the callbacks mutate.
class StreamingCVRReader {

    // config for the contest
    private final ContestConfig config;
    // path of the source file
    private final String excelFilePath;
    // name of the source file
    private final String excelFileName;
    // column index of first ranking
    private final int firstVoteColumnIndex;
    // row index of first CVR
    private final int firstVoteRowIndex;
    // column index of CVR ID (if present)
    private final Integer idColumnIndex;
    // column index of currentPrecinct name (if present)
    private final Integer precinctColumnIndex;
    // map for tracking unrecognized candidates during parsing
    private final Map<String, Integer> unrecognizedCandidateCounts = new HashMap<>();
    // used for generating CVR IDs
    private int cvrIndex = 0;
    // list of currentRankings for CVR in progress
    private LinkedList<Pair<Integer, String>> currentRankings;
    // list of raw strings for CVR in progress
    private LinkedList<String> currentCVRData;
    // supplied CVR ID for CVR in progress
    private String currentSuppliedCVRID;
    // precinct ID for CVR in progress
    private String currentPrecinct;
    // place to store input CVR list (new CVRs will be appended as we parse)
    private List<CastVoteRecord> cvrList;
    // store precinctIDs (new IDs will be added as we parse)
    private Set<String> precinctIDs;

    // last rankings cell observed for CVR in progress
    private int lastRankSeen;

    // function: StreamingCVRReader
    // purpose: class constructor
    // param: config a ContestConfig object specifying rules for interpreting CVR file data
    // param: source file to read
    StreamingCVRReader(ContestConfig config, CVRSource source) {
        this.config = config;
        this.excelFilePath = config.resolveConfigPath(source.getFilePath());
        // cvrFileName for generating cvrIDs
        this.excelFileName = new File(excelFilePath).getName();

        // to keep our code simple, we convert 1-indexed user-supplied values to 0-indexed here
        this.firstVoteColumnIndex = source.getFirstVoteColumnIndex() - 1;
        this.firstVoteRowIndex = source.getFirstVoteRowIndex() - 1;
        // the ID and precinct columns are optional: null means "not supplied"
        this.idColumnIndex = source.getIdColumnIndex() != null ? source.getIdColumnIndex() - 1 : null;
        this.precinctColumnIndex = source.getPrecinctColumnIndex() != null ? source.getPrecinctColumnIndex() - 1
                : null;
    }

    // function: getCellAddress
    // purpose: given Excel-style address string return the cell address as a pair of Integers
    // representing zero-based column and row of the cell address
    // param: address Excel-style cell address, e.g. "AB12"
    // return: Pair whose key is the zero-based column and whose value is the zero-based row
    // throws: InvalidParameterException if the address does not split into exactly two parts
    private static Pair<Integer, Integer> getCellAddress(String address) {
        // split at the boundary between a non-digit and a digit, which for a well-formed address
        // yields the column letters followed by the row digits
        String[] addressParts = address.split("(?<=\\D)(?=\\d)");
        if (addressParts.length != 2) {
            Logger.log(Level.SEVERE, "Invalid cell address: %s", address);
            throw new InvalidParameterException("Invalid cell address: " + address);
        }
        // row is the 0-based row of the cell
        int row = Integer.parseInt(addressParts[1]) - 1;
        // col is the 0-based column of the cell
        int col = getColumnIndex(addressParts[0]);
        // return the result as a Pair
        return new Pair<>(col, row);
    }

    // function: getColumnIndex
    // purpose: given alphabetic representation of an Excel columnAddress returns the zero-based
    // integer index of the column, e.g. "A" returns 0 and "AB" returns 27
    // param: columnAddress the column portion of an Excel cell address string
    // return: column index
    // throws: InvalidParameterException if any character is outside 'A'..'Z'
    private static int getColumnIndex(String columnAddress) {
        // result is column index
        int result = 0;
        // i indexes over the "digits" of the columnAddress string
        for (int i = 0; i < columnAddress.length(); i++) {
            // at each iteration the current total will be multiplied by 26, "shifting" it left one place
            result *= 26;
            // charValue maps the current character to a value between 1 and 26
            // ('@' immediately precedes 'A', so 'A' - '@' == 1)
            int charValue = columnAddress.charAt(i) - '@';
            if (charValue < 1 || charValue > 26) {
                Logger.log(Level.SEVERE, "Invalid cell address: %s", columnAddress);
                throw new InvalidParameterException("Invalid cell address: " + columnAddress);
            }
            result += charValue;
        }
        // finally subtract one to convert to zero-based index
        return result - 1;
    }

    // function: handleEmptyCells
    // purpose: Handle empty cells encountered while parsing a CVR.  Unlike empty rows, empty cells
    // do not trigger parsing callbacks so their existence must be inferred and handled when they
    // occur in a rankings cell.
    // param: currentRank the rank at which we stop inferring empty cells.
    private void handleEmptyCells(int currentRank) {
        // rank iterates between lastRankSeen and currentRank adding audit data and UWI rankings
        for (int rank = lastRankSeen + 1; rank < currentRank; rank++) {
            // add data to audit log
            currentCVRData.add("empty cell");
            // add UWI ranking if required by settings
            if (config.isTreatBlankAsUndeclaredWriteInEnabled()) {
                Logger.log(Level.WARNING, "Empty cell! Treating as UWI.");
                // add the new ranking
                currentRankings.add(new Pair<>(rank, config.getUndeclaredWriteInLabel()));
            }
        }
    }

    // function: beginCVR
    // purpose: prepare to begin parsing a new CVR
    private void beginCVR() {
        // setup data structures for parsing a new CVR
        cvrIndex++;
        currentRankings = new LinkedList<>();
        currentCVRData = new LinkedList<>();
        currentSuppliedCVRID = null;
        currentPrecinct = null;
        lastRankSeen = 0;
    }

    // function: endCVR
    // purpose: complete construction of new CVR object
    private void endCVR() {
        // handle any empty cells which may appear at the end of this row
        handleEmptyCells(config.getMaxRankingsAllowed() + 1);
        // determine what the new cvr ID will be
        String computedCastVoteRecordID = String.format("%s(%d)", excelFileName, cvrIndex);
        // create new cast vote record
        CastVoteRecord newRecord = new CastVoteRecord(computedCastVoteRecordID, currentSuppliedCVRID,
                currentPrecinct, currentCVRData, currentRankings);
        // add it to overall list
        cvrList.add(newRecord);
        // add precinct ID if one was found
        if (currentPrecinct != null) {
            precinctIDs.add(currentPrecinct);
        }
        // provide some user feedback on the CVR count
        if (cvrList.size() % 50000 == 0) {
            Logger.log(Level.INFO, "Parsed %d cast vote records.", cvrList.size());
        }
    }

    // function: cvrCell
    // purpose: handle CVR cell data callback
    // param: col column of this cell
    // param: cellData data contained in this cell
    private void cvrCell(int col, String cellData) {

        // add cell data to "full" audit string
        // NOTE(review): this entry is appended before handleEmptyCells() below appends the
        // "empty cell" markers for columns preceding this one, so the audit list is not strictly
        // in column order when cells are skipped -- confirm whether that ordering is intended
        currentCVRData.add(cellData);

        // check for a currentPrecinct string or CVR ID string
        if (precinctColumnIndex != null && col == precinctColumnIndex) {
            currentPrecinct = cellData;
        } else if (idColumnIndex != null && col == idColumnIndex) {
            currentSuppliedCVRID = cellData;
        }

        // nothing more to do unless this column is in the ranking range
        if (col < firstVoteColumnIndex || col >= firstVoteColumnIndex + config.getMaxRankingsAllowed()) {
            return;
        }

        // rank for this column
        int currentRank = col - firstVoteColumnIndex + 1;
        // handle any empty cells which may exist between this cell and any previous one
        handleEmptyCells(currentRank);
        // get the candidate name
        String candidate = cellData.trim();
        // skip undervotes
        if (!candidate.equals(config.getUndervoteLabel())) {
            // map overvotes to our internal overvote string
            if (candidate.equals(config.getOvervoteLabel())) {
                candidate = Tabulator.EXPLICIT_OVERVOTE_LABEL;
            } else if (!config.getCandidateCodeList().contains(candidate)
                    && !candidate.equals(config.getUndeclaredWriteInLabel())) {
                // this is an unrecognized candidate so add it to the unrecognized candidate map
                // this helps identify problems with CVRs
                unrecognizedCandidateCounts.merge(candidate, 1, Integer::sum);
            }
            // create and add the new ranking (unrecognized candidates are still recorded)
            Pair<Integer, String> ranking = new Pair<>(currentRank, candidate);
            currentRankings.add(ranking);
        }
        // update lastRankSeen - used to handle empty ranking cells
        lastRankSeen = currentRank;
    }

    // function: createSheetContentsHandler
    // purpose: build the callback object which routes row and cell parsing events into
    // beginCVR / cvrCell / endCVR, ignoring any rows above the first vote row
    // return: handler to be wrapped by an XSSFSheetXMLHandler
    private SheetContentsHandler createSheetContentsHandler() {
        return new SheetContentsHandler() {
            // function: startRow
            // purpose: startRow callback handler during xml parsing
            // param: i the row which has started
            @Override
            public void startRow(int i) {
                if (i >= firstVoteRowIndex) {
                    beginCVR();
                }
            }

            // function: endRow
            // purpose: endRow callback handler during xml parsing
            // row has completed, we will create a new cvr object
            // param: i the row which has ended
            @Override
            public void endRow(int i) {
                if (i >= firstVoteRowIndex) {
                    endCVR();
                }
            }

            // function: cell
            // purpose: cell callback handler during xml parsing
            // param: s cell address in Excel notation (column letters then row number)
            // param: s1 cell data
            // param: xssfComment additional cell data (unused)
            @Override
            public void cell(String s, String s1, XSSFComment xssfComment) {
                // address contains the col (key) and row (value) of this cell
                Pair<Integer, Integer> address = getCellAddress(s);
                int col = address.getKey();
                int row = address.getValue();
                if (row >= firstVoteRowIndex) {
                    cvrCell(col, s1);
                }
            }

            // function: headerFooter
            // purpose: header footer callback from xml parsing - unused
            // param: s header footer data
            // param: b header footer data
            // param: s1 header footer data
            @Override
            public void headerFooter(String s, boolean b, String s1) {
                // spreadsheet-supplied text is passed as an argument rather than embedded in the
                // format string so that a literal '%' in it cannot be misread as a format specifier
                Logger.log(Level.WARNING, "Unexpected XML data: %s", String.format("%s %b %s", s, b, s1));
            }
        };
    }

    // function: parseCVRFile
    // purpose: parse the given file into a List of CastVoteRecords for tabulation
    // param: castVoteRecords existing list to append new CastVoteRecords to
    // param: precinctIDs existing set of precinctIDs discovered during CVR parsing
    // returns: list of parsed CVRs (the same list instance which was passed in)
    // throws: UnrecognizedCandidatesException if any ranking named a candidate which is not in the
    // config; note the records have already been appended to castVoteRecords when this is thrown
    // throws: OpenXML4JException, SAXException, IOException on failure to open or parse the file
    List<CastVoteRecord> parseCVRFile(List<CastVoteRecord> castVoteRecords, Set<String> precinctIDs)
            throws UnrecognizedCandidatesException, OpenXML4JException, SAXException, IOException {

        // cache the cvr list so it is accessible in callbacks
        cvrList = castVoteRecords;
        // cache precinctIDs set so it is accessible in callbacks
        this.precinctIDs = precinctIDs;

        // open the zip package
        OPCPackage pkg = OPCPackage.open(excelFilePath);
        try {
            // pull out strings
            ReadOnlySharedStringsTable sharedStrings = new ReadOnlySharedStringsTable(pkg);
            // XSSF reader is used to extract styles data
            XSSFReader xssfReader = new XSSFReader(pkg);
            // styles data is used for creating ContentHandler
            StylesTable styles = xssfReader.getStylesTable();
            // create the ContentHandler to handle parsing callbacks
            ContentHandler handler =
                    new XSSFSheetXMLHandler(styles, sharedStrings, createSheetContentsHandler(), true);

            // create the XML reader and set content handler
            // NOTE(review): XMLReaderFactory is deprecated in newer JDKs; consider a
            // SAXParserFactory-based reader when the project's minimum Java version allows
            XMLReader xmlReader = XMLReaderFactory.createXMLReader();
            xmlReader.setContentHandler(handler);
            // trigger parsing (only the first sheet returned by the reader is read)
            xmlReader.parse(new InputSource(xssfReader.getSheetsData().next()));
        } finally {
            // always release the underlying file, even if parsing failed; revert() closes the
            // package WITHOUT saving so the source CVR file can never be written back to
            pkg.revert();
        }

        // throw if there were any unrecognized candidates -- this is considered bad
        if (!unrecognizedCandidateCounts.isEmpty()) {
            throw new UnrecognizedCandidatesException(unrecognizedCandidateCounts);
        }

        // return the input list with additions
        return cvrList;
    }

    // exception class used when an unrecognized candidate is encountered during cvr parsing
    // purpose is to help identify issues with cast vote record files and configuration files
    static class UnrecognizedCandidatesException extends Exception {

        // candidateCounts maps an unrecognized candidate name to the count of how many times it was
        // encountered during CVR parsing
        final Map<String, Integer> candidateCounts;

        // function: UnrecognizedCandidatesException
        // purpose: constructor
        // param: candidateCounts maps unrecognized candidates to the number of times they were
        // encountered during parsing
        UnrecognizedCandidatesException(Map<String, Integer> candidateCounts) {
            this.candidateCounts = candidateCounts;
        }
    }
}