edu.emory.cci.aiw.cvrg.eureka.etl.spreadsheet.XlsxDataProvider.java Source code

Java tutorial

Introduction

Here is the source code for edu.emory.cci.aiw.cvrg.eureka.etl.spreadsheet.XlsxDataProvider.java

Source

/*
 * #%L
 * Eureka Services
 * %%
 * Copyright (C) 2012 - 2013 Emory University
 * %%
 * This program is dual licensed under the Apache 2 and GPLv3 licenses.
 * 
 * Apache License, Version 2.0:
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * GNU General Public License version 3:
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-3.0.html>.
 * #L%
 */
package edu.emory.cci.aiw.cvrg.eureka.etl.spreadsheet;

import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.MessageFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

import org.apache.poi.hssf.util.CellReference;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An implementation of the {@link DataProvider} interface, using an Excel
 * workbook as the data source.
 *
 * @author hrathod
 *
 */
public class XlsxDataProvider implements DataProvider {

    /**
     * The standard date format for data held in the workbook.
     */
    private static ThreadLocal<DateFormat> dateFormat = new ThreadLocal<DateFormat>() {
        @Override
        protected SimpleDateFormat initialValue() {
            return new SimpleDateFormat("yyyy.MM.dd HH:mm:ss");
        }
    };
    /**
     * The class level logger.
     */
    private static final Logger LOGGER = LoggerFactory.getLogger(XlsxDataProvider.class);
    /**
     * Log message template for a missing required worksheet. Takes one
     * parameter, the worksheet name.
     */
    private final MessageFormat logErrorMessage;
    /**
     * Exception message for a missing required worksheet. Takes one parameter,
     * the worksheet name.
     */
    private final MessageFormat exceptionErrorMessage;
    /**
     * A handle to the input stream so we can close it.
     */
    private final OPCPackage opcPackage;
    /**
     * Holds the workbook associated with the data file.
     */
    private final XSSFWorkbook workbook;
    /**
     * The list of patients parsed from the data file.
     */
    private List<Patient> patients;
    /**
     * The list of providers parsed from the data file.
     */
    private List<Provider> providers;
    /**
     * The list of encounters parsed from the data file.
     */
    private List<Encounter> encounters;
    /**
     * The list of CPT codes parsed from the data file.
     */
    private List<CPT> cpts;
    /**
     * The list of ICD9 Diagnostic codes parsed from the data file.
     */
    private List<Icd9Diagnosis> icd9Diagnoses;
    /**
     * The list of ICD9 Procedure codes parsed from the data file.
     */
    private List<Icd9Procedure> icd9Procedures;
    /**
     * The list of medications parsed from the data file.
     */
    private List<Medication> medications;
    /**
     * The list of labs parsed from the data file.
     */
    private List<Lab> labs;
    /**
     * The list of vitals parsed from the data file.
     */
    private List<Vital> vitals;
    private final ResourceBundle messages;

    private final File dataFile;

    /**
     * Create the data provider from the given data file.
     *
     * @param inDataFile The Excel workbook file to use as the data store.
     * @throws DataProviderException Thrown when the workbook can not be
     * accessed, or parsed correctly.
     */
    public XlsxDataProvider(File inDataFile, Locale locale) throws DataProviderException {
        if (inDataFile == null) {
            throw new IllegalArgumentException("inDataFile cannot be null");
        }

        this.logErrorMessage = new MessageFormat(MessageFormat.format("Spreadsheet {0} is missing required sheet",
                new Object[] { inDataFile.getAbsolutePath() }) + " {0}");
        this.exceptionErrorMessage = new MessageFormat("Required worksheet {0} is missing");

        try {
            LOGGER.debug("Creating workbook from {}", inDataFile.getAbsolutePath());
            this.opcPackage = OPCPackage.open(inDataFile.getAbsolutePath(), PackageAccess.READ);
            this.workbook = new XSSFWorkbook(this.opcPackage);
            this.validateWorksheets();
        } catch (InvalidFormatException | InvalidOperationException ex) {
            throw new DataProviderException("Invalid XLSX file", ex);
        } catch (IOException ioe) {
            throw new DataProviderException("Error reading XLSX file", ioe);
        }
        this.messages = locale != null ? ResourceBundle.getBundle("Messages", locale)
                : ResourceBundle.getBundle("Messages");
        this.dataFile = inDataFile;
    }

    /**
     * Make sure that all the necessary worksheets are present in the given
     * workbook.
     *
     * @throws DataProviderException Thrown if there are missing worksheets in
     * the given workbook.
     */
    private void validateWorksheets() throws DataProviderException {
        String[] requiredSheetNames = new String[] { "patient", "provider", "encounter" };
        for (String sheetName : requiredSheetNames) {
            XSSFSheet sheet = this.workbook.getSheet(sheetName);
            if (sheet == null) {
                throw new DataProviderException(this.exceptionErrorMessage.format(new String[] { sheetName }));
            }
        }
    }

    public File getDataFile() {
        return dataFile;
    }

    @Override
    public List<Patient> getPatients() throws DataProviderException {
        if (this.patients == null) {
            this.patients = this.readPatients();
        }
        return this.patients;
    }

    @Override
    public List<Provider> getProviders() throws DataProviderException {
        if (this.providers == null) {
            this.providers = this.readProviders();
        }
        return this.providers;
    }

    @Override
    public List<Encounter> getEncounters() throws DataProviderException {
        if (this.encounters == null) {
            this.encounters = this.readEncounters();
        }
        return this.encounters;
    }

    @Override
    public List<CPT> getCptCodes() throws DataProviderException {
        if (this.cpts == null) {
            this.cpts = this.readCpts();
        }
        return this.cpts;
    }

    @Override
    public List<Icd9Diagnosis> getIcd9Diagnoses() throws DataProviderException {
        if (this.icd9Diagnoses == null) {
            this.icd9Diagnoses = this.readIcd9Diagnoses();
        }
        return this.icd9Diagnoses;
    }

    @Override
    public List<Icd9Procedure> getIcd9Procedures() throws DataProviderException {
        if (this.icd9Procedures == null) {
            this.icd9Procedures = this.readIcd9Procedures();
        }
        return this.icd9Procedures;
    }

    @Override
    public List<Medication> getMedications() throws DataProviderException {
        if (this.medications == null) {
            this.medications = this.readMedications();
        }
        return this.medications;
    }

    @Override
    public List<Lab> getLabs() throws DataProviderException {
        if (this.labs == null) {
            this.labs = this.readLabs();
        }
        return this.labs;
    }

    @Override
    public List<Vital> getVitals() throws DataProviderException {
        if (this.vitals == null) {
            this.vitals = this.readVitals();
        }
        return this.vitals;
    }

    @Override
    public void close() throws IOException {
        if (this.opcPackage != null) {
            this.opcPackage.revert(); //Close without saving.
        }
    }

    /**
     * Convenience method to read a required sheet and throw an exception if the
     * sheet is not present.
     *
     * @param sheetName the sheet's name.
     * @return the sheet, guaranteed not <code>null</code>.
     * @throws DataProviderException if the sheet is not present.
     */
    private XSSFSheet readRequiredSheet(String sheetName) throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet(sheetName);
        if (sheet == null) {
            LOGGER.error(this.logErrorMessage.format(sheetName));
            throw new DataProviderException(this.exceptionErrorMessage.format(sheetName));
        }
        return sheet;
    }

    /**
     * Parse the list of patients from the workbook.
     *
     * @return A list of {@link Patient} objects.
     */
    private List<Patient> readPatients() throws DataProviderException {
        XSSFSheet sheet = readRequiredSheet("patient");
        String sheetName = sheet.getSheetName();
        List<Patient> result = new ArrayList<>();
        Iterator<Row> rows = sheet.rowIterator();
        rows.next(); // skip header row
        while (rows.hasNext()) {
            Row row = rows.next();
            Patient patient = new Patient();
            patient.setId(readLongValue(sheetName, row.getCell(0)));
            patient.setFirstName(readStringValue(sheetName, row.getCell(1)));
            patient.setLastName(readStringValue(sheetName, row.getCell(2)));
            patient.setDateOfBirth(readDateValue(sheetName, row.getCell(3)));
            patient.setLanguage(readStringValue(sheetName, row.getCell(4)));
            patient.setMaritalStatus(readStringValue(sheetName, row.getCell(5)));
            patient.setRace(readStringValue(sheetName, row.getCell(6)));
            patient.setGender(readStringValue(sheetName, row.getCell(7)));
            result.add(patient);
        }
        return result;
    }

    /**
     * Parse the list of providers in the workbook.
     *
     * @return A list of {@link Provider} objects.
     */
    private List<Provider> readProviders() throws DataProviderException {
        XSSFSheet sheet = readRequiredSheet("provider");
        String sheetName = sheet.getSheetName();
        List<Provider> result = new ArrayList<>();
        Iterator<Row> rows = sheet.rowIterator();
        rows.next(); // skip header row
        while (rows.hasNext()) {
            Row row = rows.next();
            Provider provider = new Provider();
            provider.setId(readLongValue(sheetName, row.getCell(0)));
            provider.setFirstName(readStringValue(sheetName, row.getCell(1)));
            provider.setLastName(readStringValue(sheetName, row.getCell(2)));
            result.add(provider);
        }
        return result;
    }

    /**
     * Parse the list of encounters in the workbook.
     *
     * @return A list of {@link Encounter} objects.
     */
    private List<Encounter> readEncounters() throws DataProviderException {
        XSSFSheet sheet = readRequiredSheet("encounter");
        String sheetName = sheet.getSheetName();
        List<Encounter> result = new ArrayList<>();
        Iterator<Row> rows = sheet.rowIterator();
        LOGGER.debug("Encounter iterator: {}", Integer.valueOf(rows.hashCode()));
        rows.next(); // skip header row
        while (rows.hasNext()) {
            Row row = rows.next();
            Encounter encounter = new Encounter();
            encounter.setId(readLongValue(sheetName, row.getCell(0)));
            encounter.setPatientId(readLongValue(sheetName, row.getCell(1)));
            encounter.setProviderId(readLongValue(sheetName, row.getCell(2)));
            encounter.setStart(readDateValue(sheetName, row.getCell(3)));
            encounter.setEnd(readDateValue(sheetName, row.getCell(4)));
            encounter.setType(readStringValue(sheetName, row.getCell(5)));
            encounter.setDischargeDisposition(readStringValue(sheetName, row.getCell(6)));
            result.add(encounter);
        }
        return result;
    }

    /**
     * Parse the list of CPT codes in the workbook.
     *
     * @return A list of {@link CPT} objects.
     */
    private List<CPT> readCpts() throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet("eCPT");
        List<CPT> result = new ArrayList<>();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Iterator<Row> rows = sheet.rowIterator();
            rows.next(); // skip header row
            while (rows.hasNext()) {
                Row row = rows.next();
                CPT cpt = new CPT();
                cpt.setId(readStringValue(sheetName, row.getCell(0)));
                cpt.setEncounterId(readLongValue(sheetName, row.getCell(1)));
                cpt.setTimestamp(readDateValue(sheetName, row.getCell(2)));
                cpt.setEntityId(readStringValue(sheetName, row.getCell(3)));
                result.add(cpt);
            }
        }
        return result;
    }

    /**
     * Parse the list of ICD9 Diagnostic codes present in the workbook.
     *
     * @return A list of {@link Icd9Diagnosis} objects.
     */
    private List<Icd9Diagnosis> readIcd9Diagnoses() throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet("eICD9D");
        List<Icd9Diagnosis> result = new ArrayList<>();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Iterator<Row> rows = sheet.rowIterator();
            rows.next(); // skip header row
            Map<Long, Integer> ranksByEncounter = new HashMap<>();
            while (rows.hasNext()) {
                Row row = rows.next();
                Icd9Diagnosis diagnosis = new Icd9Diagnosis();
                diagnosis.setId(readStringValue(sheetName, row.getCell(0)));
                Long encId = readLongValue(sheetName, row.getCell(1));
                diagnosis.setEncounterId(encId);
                diagnosis.setTimestamp(readDateValue(sheetName, row.getCell(2)));
                diagnosis.setEntityId(readStringValue(sheetName, row.getCell(3)));
                Integer rank = ranksByEncounter.get(encId);
                if (rank == null) {
                    diagnosis.setRank(1);
                    ranksByEncounter.put(encId, 1);
                } else {
                    int newRank = rank + 1;
                    diagnosis.setRank(newRank);
                    ranksByEncounter.put(encId, newRank);
                }
                result.add(diagnosis);
            }
        }
        return result;
    }

    /**
     * Parse the list of ICD9 Procedure codes present in the workbook.
     *
     * @return A list of {@link Icd9Procedure} objects.
     */
    private List<Icd9Procedure> readIcd9Procedures() throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet("eICD9P");
        List<Icd9Procedure> result = new ArrayList<>();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Iterator<Row> rows = sheet.rowIterator();
            rows.next(); // skip header row
            while (rows.hasNext()) {
                Row row = rows.next();
                Icd9Procedure procedure = new Icd9Procedure();
                procedure.setId(readStringValue(sheetName, row.getCell(0)));
                procedure.setEncounterId(readLongValue(sheetName, row.getCell(1)));
                procedure.setTimestamp(readDateValue(sheetName, row.getCell(2)));
                procedure.setEntityId(readStringValue(sheetName, row.getCell(3)));
                result.add(procedure);
            }
        }
        return result;
    }

    /**
     * Parse the list of medications present in the workbook.
     *
     * @return A list of {@link Medication} objects.
     */
    private List<Medication> readMedications() throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet("eMEDS");
        List<Medication> result = new ArrayList<>();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Iterator<Row> rows = sheet.rowIterator();
            rows.next(); // skip header row
            while (rows.hasNext()) {
                Row row = rows.next();
                Medication medication = new Medication();
                medication.setId(readStringValue(sheetName, row.getCell(0)));
                medication.setEncounterId(readLongValue(sheetName, row.getCell(1)));
                medication.setTimestamp(readDateValue(sheetName, row.getCell(2)));
                medication.setEntityId(readStringValue(sheetName, row.getCell(3)));
                result.add(medication);
            }
        }
        return result;
    }

    /**
     * Parse the list of labs present in the workbook's "eLABS" worksheet.
     *
     * @return A list of {@link Lab} objects.
     */
    private List<Lab> readLabs() throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet("eLABS");
        List<Lab> result = new ArrayList<>();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Iterator<Row> rows = sheet.rowIterator();
            rows.next(); // skip header row
            while (rows.hasNext()) {
                Row row = rows.next();
                Lab lab = new Lab();
                lab.setId(readStringValue(sheetName, row.getCell(0)));
                lab.setEncounterId(readLongValue(sheetName, row.getCell(1)));
                lab.setTimestamp(readDateValue(sheetName, row.getCell(2)));
                lab.setEntityId(readStringValue(sheetName, row.getCell(3)));
                lab.setResultAsStr(readStringValue(sheetName, row.getCell(4)));
                lab.setResultAsNum(readDoubleValue(sheetName, row.getCell(5)));
                lab.setUnits(readStringValue(sheetName, row.getCell(6)));
                lab.setFlag(readStringValue(sheetName, row.getCell(7)));
                result.add(lab);
            }
        }
        return result;
    }

    /**
     * Parse the list of vitals present in the workbook's "eVITALS" worksheet.
     *
     * @return A list of {@link Vital} objects.
     */
    private List<Vital> readVitals() throws DataProviderException {
        XSSFSheet sheet = this.workbook.getSheet("eVITALS");
        List<Vital> result = new ArrayList<>();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Iterator<Row> rows = sheet.rowIterator();
            rows.next(); // skip header row
            while (rows.hasNext()) {
                Row row = rows.next();
                Vital vital = new Vital();
                vital.setId(readStringValue(sheetName, row.getCell(0)));
                vital.setEncounterId(readLongValue(sheetName, row.getCell(1)));
                vital.setTimestamp(readDateValue(sheetName, row.getCell(2)));
                vital.setEntityId(readStringValue(sheetName, row.getCell(3)));
                vital.setResultAsStr(readStringValue(sheetName, row.getCell(4)));
                vital.setResultAsNum(readDoubleValue(sheetName, row.getCell(5)));
                vital.setUnits(readStringValue(sheetName, row.getCell(6)));
                vital.setFlag(readStringValue(sheetName, row.getCell(7)));
                result.add(vital);
            }
        }
        return result;
    }

    /**
     * Read a date value from the given spreadsheet cell.
     *
     * @param cell The cell to read the value from.
     * @return The date in the cell, if valid, null otherwise.
     */
    private Date readDateValue(String sheetName, Cell cell) throws DataProviderException {
        Date result = null;
        String value = readStringValue(sheetName, cell);
        if (value != null) {
            try {
                result = dateFormat.get().parse(value);
            } catch (ParseException e) {
                throwException(sheetName, cell, e.getMessage(), e);
            }
        }
        return result;
    }

    /**
     * Read a string value from the given cell.
     *
     * @param cell The cell to read value from.
     * @return A String containing the cell value, if valid, null otherwise.
     */
    private String readStringValue(String sheetName, Cell cell) throws DataProviderException {
        String result = null;
        if (cell != null) {
            try {
                int cellType = cell.getCellType();
                if (cellType == Cell.CELL_TYPE_STRING) {
                    result = cell.getStringCellValue();
                } else if (cellType == Cell.CELL_TYPE_NUMERIC) {
                    result = Double.toString(cell.getNumericCellValue());
                } else if (cellType == Cell.CELL_TYPE_BOOLEAN) {
                    result = Boolean.toString(cell.getBooleanCellValue());
                } else {
                    throwException(sheetName, cell, "Cell type must be a number, string, boolean or blank");
                }
            } catch (Exception e) {
                throwException(sheetName, cell, e.getMessage(), e);
            }
        }
        return result;
    }

    private void throwException(String sheetName, Cell cell, String problemDescription)
            throws DataProviderException {
        throwException(sheetName, cell, problemDescription, null);
    }

    private void throwException(String sheetName, Cell cell, String problemDescription, Exception cause)
            throws DataProviderException {
        String msgTemplate = messages.getString("xlsxDataProvider.error.parsing");
        CellReference cellRef = new CellReference(cell.getRowIndex(), cell.getColumnIndex());
        String msg = MessageFormat.format(msgTemplate, sheetName, cellRef.formatAsString(), problemDescription);
        throw new DataProviderException(msg, cause);
    }

    /**
     * Read a numerical value as a Long type from the given cell.
     *
     * @param cell The cell to read the value from.
     * @return A Long containing the cell's value, if valid, null otherwise.
     */
    private Long readLongValue(String sheetName, Cell cell) throws DataProviderException {
        Long result = null;
        if (cell != null) {
            try {
                Double value = new Double(cell.getNumericCellValue());
                result = Long.valueOf(value.longValue());
            } catch (Exception e) {
                throwException(sheetName, cell, e.getMessage(), e);
            }
        }
        return result;
    }

    /**
     * Read the give cell's value as a Double type.
     *
     * @param cell The cell to read the value from.
     * @return A Double containing the cell value, if valid, null otherwise.
     */
    private Double readDoubleValue(String sheetName, Cell cell) throws DataProviderException {
        Double result = null;
        if (cell != null) {
            try {
                double value = cell.getNumericCellValue();
                result = Double.valueOf(value);
            } catch (Exception e) {
                throwException(sheetName, cell, e.getMessage(), e);
            }
        }
        return result;
    }
}