org.ohdsi.jCdmBuilder.EtlReport.java Source code

Java tutorial

Introduction

Here is the source code for org.ohdsi.jCdmBuilder.EtlReport.java

Source

/*******************************************************************************
 * Copyright 2015 Observational Health Data Sciences and Informatics
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.ohdsi.jCdmBuilder;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.ohdsi.jCdmBuilder.utilities.CodeToConceptMap;
import org.ohdsi.jCdmBuilder.utilities.CodeToConceptMap.CodeData;
import org.ohdsi.utilities.StringUtilities;
import org.ohdsi.utilities.collections.CountingSet;
import org.ohdsi.utilities.collections.IntegerComparator;
import org.ohdsi.utilities.collections.OneToManyList;
import org.ohdsi.utilities.files.MultiRowIterator.MultiRowSet;

/**
 * Class for creating an ETL report with details of the ETL, including any errors. Two types of report can be generated: The ETL report itself contains
 * high-level statistics, the Problem report contains patient-specific error messages.
 * 
 * @author MSCHUEMI
 * 
 */
public class EtlReport {

    public static long MAX_REPORT_PROBLEM = 1000;

    private Map<String, Problem> problems = new HashMap<String, Problem>();
    private CountingSet<String> incomingTableRowCounts = new CountingSet<String>();
    private CountingSet<String> outgoingTableRowCounts = new CountingSet<String>();
    private String folder;
    private long totalProblemCount = 0;

    /**
     * 
     * @param folder
     *            Folder where the ETL and Problem report will be generated
     */
    public EtlReport(String folder) {
        this.folder = folder;
    }

    /**
     * Report a problem to be mentioned in the ETL and Problem report
     * 
     * @param table
     *            Name of the relevant table
     * @param problemType
     *            Type of problem. Error counts will be aggregated by problemType in the ETL report
     * @param personId
     *            The person_id for which the problem was found
     */
    public void reportProblem(String table, String problemType, String personId) {
        totalProblemCount++;
        Problem problem = problems.get(table + "\t" + problemType);
        if (problem == null) {
            problem = new Problem();
            problem.problemType = problemType;
            problem.table = table;
            problems.put(table + "\t" + problemType, problem);
        }
        problem.count++;
        if (problem.count < MAX_REPORT_PROBLEM)
            problem.personId.add(personId);
        if (problem.count == MAX_REPORT_PROBLEM)
            System.out.println("Warning: encountered " + MAX_REPORT_PROBLEM + " problems of type '" + problemType
                    + "' in table " + table);
    }

    /**
     * 
     * @return the total number of problems reported during the ETL
     */
    public long getTotalProblemCount() {
        return totalProblemCount;
    }

    /**
     * Generates the patient-specific Problem report
     * 
     * @return The full path to the generated Problem report
     */
    public String generateProblemReport() {
        StringUtilities.outputWithTime("Generating ETL problem list");
        String filename = generateETLProblemListFilename(folder);
        XSSFWorkbook workbook = new XSSFWorkbook();

        XSSFSheet sheet = workbook.createSheet("Problems");
        addRow(sheet, "Table", "Problem", "Person_id");
        for (Problem problem : problems.values()) {
            for (String personId : problem.personId)
                addRow(sheet, problem.table, problem.problemType, personId);
            // for (int i = 0; i < Math.min(MAX_REPORT_PROBLEM, problem.count); i++)
            // addRow(sheet, problem.table, problem.problemType, problem.personId.get(i));
            if (problem.count > MAX_REPORT_PROBLEM)
                addRow(sheet, problem.table, problem.problemType,
                        "in " + (problem.count - MAX_REPORT_PROBLEM) + " other persons");
        }
        try {
            FileOutputStream out = new FileOutputStream(new File(filename));
            workbook.write(out);
            out.close();
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage());
        }
        return filename;
    }

    /**
     * Generates the overall ETL report
     * 
     * @param codeToConceptMaps
     *            the codeToConceptMaps used in the ETL
     * @return the full path to the ETL report
     */
    public String generateETLReport(CodeToConceptMap... codeToConceptMaps) {
        StringUtilities.outputWithTime("Generating ETL report");
        String filename = generateFilename(folder);
        XSSFWorkbook workbook = new XSSFWorkbook();

        XSSFSheet sheet = workbook.createSheet("Overview");
        addRow(sheet, "Source tables");
        addRow(sheet, "");
        addRow(sheet, "Table name", "Number of records");
        for (String table : incomingTableRowCounts)
            addRow(sheet, table, Integer.valueOf(incomingTableRowCounts.getCount(table)));
        addRow(sheet, "");

        addRow(sheet, "CDM tables");
        addRow(sheet, "");
        addRow(sheet, "Table name", "Number of records");
        for (String table : outgoingTableRowCounts)
            addRow(sheet, table, Integer.valueOf(outgoingTableRowCounts.getCount(table)));
        addRow(sheet, "");

        addRow(sheet, "Number of problems encountered", Long.valueOf(totalProblemCount));
        addRow(sheet, "");
        addRow(sheet, "Mapping", "Mapped unique codes", "Unmapped unique codes", "Mapped total codes",
                "Unmapped total codes");
        for (CodeToConceptMap codeToConceptMap : codeToConceptMaps) {
            int uniqueMapped = 0;
            int uniqueUnmapped = 0;
            long totalMapped = 0;
            long totalUnmapped = 0;
            CountingSet<String> codeCounts = codeToConceptMap.getCodeCounts();
            for (String code : codeCounts) {
                if (!codeToConceptMap.getConceptId(code).equals(0)) {
                    uniqueMapped++;
                    totalMapped += codeCounts.getCount(code);
                } else {
                    uniqueUnmapped++;
                    totalUnmapped += codeCounts.getCount(code);
                }
            }
            addRow(sheet, codeToConceptMap.getName(), Integer.valueOf(uniqueMapped),
                    Integer.valueOf(uniqueUnmapped), Long.valueOf(totalMapped), Long.valueOf(totalUnmapped));
        }

        sheet = workbook.createSheet("Problems");
        addRow(sheet, "Table", "Description", "Nr of rows");
        for (Problem problem : problems.values())
            addRow(sheet, problem.table, problem.problemType, Long.valueOf(problem.count));

        for (CodeToConceptMap codeToConceptMap : codeToConceptMaps) {
            sheet = workbook.createSheet(codeToConceptMap.getName());
            addRow(sheet, "Frequency", "Source code", "Source code description", "Target concept ID", "Target code",
                    "Target concept description");
            CountingSet<String> codeCounts = codeToConceptMap.getCodeCounts();
            List<Map.Entry<String, CountingSet.Count>> codes = new ArrayList<Map.Entry<String, CountingSet.Count>>(
                    codeCounts.key2count.entrySet());
            reverseFrequencySort(codes);
            for (Map.Entry<String, CountingSet.Count> code : codes) {
                CodeData codeData = codeToConceptMap.getCodeData(code.getKey());
                if (codeData == null)
                    addRow(sheet, Integer.valueOf(code.getValue().count), code.getKey(), "", Integer.valueOf(0), "",
                            "");
                else
                    for (int i = 0; i < codeData.targetConceptIds.length; i++)
                        addRow(sheet, Integer.valueOf(code.getValue().count), code.getKey(), codeData.description,
                                Integer.valueOf(codeData.targetConceptIds[i]), codeData.targetCodes[i],
                                codeData.targetDescriptions[i]);
            }
        }

        try {
            FileOutputStream out = new FileOutputStream(new File(filename));
            workbook.write(out);
            out.close();
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage());
        }

        return filename;
    }

    private void reverseFrequencySort(List<Entry<String, CountingSet.Count>> codes) {
        Collections.sort(codes, new Comparator<Entry<String, CountingSet.Count>>() {

            @Override
            public int compare(Entry<String, CountingSet.Count> o1, Entry<String, CountingSet.Count> o2) {
                return -IntegerComparator.compare(o1.getValue().count, o2.getValue().count);
            }
        });

    }

    private void addRow(XSSFSheet sheet, Object... values) {
        Row row = sheet.createRow(sheet.getPhysicalNumberOfRows());
        for (Object value : values) {
            Cell cell = row.createCell(row.getPhysicalNumberOfCells());

            if (value instanceof Integer && value instanceof Long && value instanceof Double)
                cell.setCellValue(Double.valueOf(value.toString()));
            else
                cell.setCellValue(value.toString());

        }
    }

    private String generateFilename(String folder) {
        String filename = folder + "/ETLReport.xlsx";
        int i = 1;
        while (new File(filename).exists())
            filename = folder + "/ETLReport" + (i++) + ".xlsx";
        return filename;
    }

    private String generateETLProblemListFilename(String folder) {
        String filename = folder + "/ETLProblems.xlsx";
        int i = 1;
        while (new File(filename).exists())
            filename = folder + "/ETLPRoblems" + (i++) + ".xlsx";
        return filename;
    }

    /**
     * Register source data of 1 person. Use this function for datasets with 1 table, and 1 row per person.
     * 
     * @param table
     *            Name of the source table
     * @param row
     *            Row in the source table containing all data for 1 person
     */
    public void registerIncomingData(String table, org.ohdsi.utilities.files.Row row) {
        incomingTableRowCounts.add(table, 1);
    }

    /**
     * Register source data of 1 person
     * 
     * @param multiRowSet
     *            The set of rows in one or several tables containing all data for 1 person
     */

    public void registerIncomingData(MultiRowSet data) {
        for (String table : data.keySet())
            incomingTableRowCounts.add(table, data.get(table).size());
    }

    public void registerOutgoingData(OneToManyList<String, org.ohdsi.utilities.files.Row> data) {
        for (String table : data.keySet())
            outgoingTableRowCounts.add(table, data.get(table).size());
    }

    private class Problem {
        public String table;
        public String problemType;
        public long count = 0;
        public List<String> personId = new ArrayList<String>();
    }
}