org.kurator.validation.actors.io.AnalysisSpreadsheetBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.kurator.validation.actors.io.AnalysisSpreadsheetBuilder.java

Source

/*
 * Copyright (c) 2014 President and Fellows of Harvard College
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of Version 2 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package org.kurator.validation.actors.io;

import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hssf.util.HSSFColor;
import org.apache.poi.ss.usermodel.CellStyle;
import org.kurator.akka.KuratorActor;
import org.kurator.validation.data.AnalysisSummary;

import java.io.*;
import java.util.*;

/**
 * Generates a spreadsheet from analysis results.
 *
 * @author lowery
 */
public class AnalysisSpreadsheetBuilder extends KuratorActor {
    private static final String INPUT_FILE_PROPERTY = "postprocess.inputFile";
    private static final String OUTPUT_FILE_PROPERTY = "postprocess.outputFile";
    private static final String ACTIONABLE_ITEMS_ONLY_PROPERTY = "postprocess.actionableItemsOnly";

    private static final int MAX_ROWS = 65535;

    private List<Map<String, Object>> summary = new ArrayList<Map<String, Object>>();

    private String[] actorNames = { "ScientificNameValidator", "CollectionEventOutlierFinder", "EventDateValidator",
            "GeoRefValidator", "BasisOfRecordValidator" };

    private Map<String, String> validationStateTextMappings = new HashMap<String, String>() {
        {
            put("UNABLE_DETERMINE_VALIDITY", "don't know");
            put("CURATED", "we have proposed this change");
            put("CORRECT", "no change needed; looks good to us");
            put("FILLED_IN", "no value was present, we have proposed one");
            put("UNABLE_CURATE", "there seems to be a problem, but we don't know how to solve it");
        }
    };

    private Map<String, String> recordColumnMap = new HashMap<String, String>() {
        {
            put("collectionCode", "Collection Code");
            put("institutionCode", "Institution Code");
            put("catalogNumber", "Catalog Number");
            put("id", "Id");
            put("occurrenceId", "occurrence Id");
            put("recordedBy", "Collector");
            put("eventDate", "Date Collected");
            put("verbatimEventDate", "Verbatim Date Collected");
            put("identifiedBy", "Determiner");
            put("scientificName", "Scientific Name");
            put("scientificNameAuthorship", "Scientific Name Authorship");
            put("taxonID", "Taxon Id");
            put("family", "Family");
            put("country", "Country");
            put("stateProvince", "State/Province");
            put("county", "County");
            put("locality", "Locality");
            put("decimalLatitude", "Decimal Latitude");
            put("decimalLongitude", "Decimal Longitude");
            put("georeferenceSources", "Georeference Sources");
            put("coordinateUncertaintyInMeters", "Coordinate Uncertainty In Meters");
            put("geodeticDatum", "Geodetic Datum");
            put("ownerInstitutionCode", "Owner Institution Code");
            put("startDayOfYear", "Start Day Of Year");
            put("month", "Month");
            put("day", "Day");
            put("year", "Year");
            put("basisOfRecord", "Basis of Record");
            put("modified", "Modified");
        }
    };

    private Map<String, String> actorDetailsColumnMap = new HashMap<String, String>() {
        {
            put("Actor Result", "Actor Result");
            put("Comment", "Provenance"); //JSON key is always "Comment" so we have to keep it and adjust col header here
            put("Source", "Source");
        }
    };

    private HSSFWorkbook wb;
    private Map<String, HSSFCellStyle> validationStateStyles;
    private HSSFSheet recordSheet;
    private Map<String, HSSFSheet> actorDetailsSheets;

    private int recordNum = 0;

    public boolean actionableItemsOnly;
    public String filePath;
    private File outputFile;

    @Override
    protected void onInitialize() throws Exception {
        this.wb = new HSSFWorkbook();

        if (filePath != null)
            outputFile = new File(filePath);

        initStyles();
        initSheets();
    }

    @Override
    protected void onData(Object value) throws Exception {
        if (value instanceof AnalysisSummary) {
            AnalysisSummary summary = (AnalysisSummary) value;

            Map record = summary.getRecord();

            Map markers = (HashMap) summary.getMarkers();
            addRecord(record, markers, recordNum + 1);

            HashSet<HashMap> actorDetails = summary.getDetailSet();
            addActorDetails(actorDetails, record, recordNum + 1);

            recordNum++;
        }
    }

    @Override
    protected void onEnd() throws Exception {
        // last step is to auto-size columns
        int numSheets = wb.getNumberOfSheets();
        int maxCols = actorDetailsColumnMap.size() < recordColumnMap.size() ? recordColumnMap.size()
                : actorDetailsColumnMap.size();

        for (int i = 0; i < numSheets; i++) {
            autoSizeColumns(wb.getSheetAt(i), maxCols);
        }

        if (outputFile == null) {
            outputFile = File.createTempFile("output_", ".xls");
        }
        wb.write(new FileOutputStream(outputFile));

        wb.close();

        publishArtifact("output_xls", outputFile.getAbsolutePath());
    }

    private void initFirstSheet(long count) {
        StringBuffer stringBuffer = new StringBuffer();

        try {
            BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(this.getClass().getResourceAsStream("/analysis.txt")));

            String line = null;

            while ((line = bufferedReader.readLine()) != null) {

                stringBuffer.append(line).append("\n");
            }

            stringBuffer.append("\nTotal record count: " + count + " occurrence records.");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        // TODO: Add list of sources to first page

        HSSFSheet sheet = wb.createSheet("Description");

        sheet.setColumnWidth(0, 18000);
        HSSFRow row = sheet.createRow(0);

        HSSFCell cell = row.createCell(0);

        CellStyle style = wb.createCellStyle();
        style.setWrapText(true);
        cell.setCellStyle(style);
        cell.setCellValue(stringBuffer.toString());
    }

    /**
     * Process a single result and add it to the summary list for post processing. If we are only dealing with
     * actionable items we check to see if a record is actionable and omit records that are not.
     *
     * @param result a single analysis result
     */
    private void processResult(HashMap result) {
        Map record = (HashMap) result.get("Record");
        boolean isActionable = checkActionable(record);

        if (!actionableItemsOnly || actionableItemsOnly && isActionable) {
            summary.add(result);
        }
    }

    /**
     * Check if a particular record is actionable or not. Actionable items are those that require further action
     * such as records that have been marked "CURATED" or "UNABLE_CURATE".
     *
     * @param record
     * @return true if actionable, false otherwise
     */
    private boolean checkActionable(Map record) {
        HashMap validationState = (HashMap) record.get("ValidationState");
        for (Object value : validationState.values()) {
            if ((((String) value).equalsIgnoreCase("CURATED")
                    || ((String) value).equalsIgnoreCase("UNABLE_CURATE"))) {
                return true;
            }
        }

        return false;
    }

    /**
     * Given a key that may occur in a map with string keys, but might have different
     * case or spacing, find a matching key in the map ignoring case and spacing.
     *
     * @param key
     * @param record map assumed to be keyed with strings
     * @return key or a case insentitive match to key found in the keys of map.
     */
    protected String normalizeKey(String key, Map record) {
        if (!record.containsKey(key)) {
            // handle case variation in terms
            Set recordKeys = record.keySet();
            Iterator i = recordKeys.iterator();
            boolean found = false;
            while (i.hasNext() && !found) {
                String recordKey = i.next().toString();
                if (key.replace(" ", "").toLowerCase().equals(recordKey.replace(" ", "").toLowerCase())) {
                    key = recordKey;
                    found = true;
                }
            }
            if (!found) {
                if (key.toLowerCase().trim().equals("source")) {
                    key = "Source";
                }
                if (key.toLowerCase().trim().equals("actor result")) {
                    key = "Actor Result";
                }
                if (key.toLowerCase().trim().equals("comment")) {
                    key = "Comment";
                }
            }
        }
        return key;
    }

    /**
     * Add the details for an actor run for a particular record to the spreadsheet.
     *
     * @param actorDetails the actor details part of the analysis result
     * @param record the record with changes applied
     * @param rowIndex the row to add actor details to
     */
    private void addActorDetails(HashSet<HashMap> actorDetails, Map record, int rowIndex) {
        for (HashMap detail : actorDetails) {
            String actorName = (String) detail.get("Actor Name");

            // get the actor details sheet for the current actor and create a new row

            HSSFSheet sheet = actorDetailsSheets.get(actorName);
            HSSFRow row = sheet.createRow(rowIndex);

            Map validationState = (HashMap) detail.get("ValidationState");

            int colIndex = 0;
            for (String key : actorDetailsColumnMap.keySet()) {
                if (!record.containsKey(key) && !validationStateTextMappings.containsKey(key)
                        && !actorDetailsColumnMap.containsKey(key)) {
                    // handle case variation in terms
                    key = this.normalizeKey(key, record);
                }

                HSSFCell cell = row.createCell(colIndex);

                // Actor details may not contain values for all the columns, only those that are relevant to that
                // particular actor (scientificName and scientificNameAuthorship for the ScientificNameValidator
                // for example). If there is no value for the current column, obtain a value from the full record.
                if (detail.containsKey(key) && detail.get(key) != null && !((String) detail.get(key)).isEmpty()) {
                    if (validationStateTextMappings.containsKey(detail.get(key))) {
                        cell.setCellValue(validationStateTextMappings.get((String) detail.get(key)));
                        cell.setCellStyle(validationStateStyles.get((String) detail.get(key)));
                    } else {
                        String value = (String) detail.get(key);
                        // TODO: Fix this upstream - remove overloading of source with initial values.
                        if (key.toLowerCase().trim().equals("source") && value != null && value.contains("|")) {
                            // Crude hack, strip out leading overloaded key:value# from before first source.
                            value = value.substring(value.indexOf('|'));
                        } else if (key.toLowerCase().trim().equals("source") && value != null
                                && !value.contains("|") && value.trim().endsWith("#")) {
                            // Crude hack, strip out the loading overloaded key:value# when no source was added
                            value = "";
                        }
                        cell.setCellValue(processDetailString(value));
                    }
                } else {
                    if (validationStateTextMappings.containsKey(detail.get(key))) {
                        cell.setCellValue(validationStateTextMappings.get((String) record.get(key)));
                        cell.setCellStyle(validationStateStyles.get((String) detail.get(key)));
                    } else {
                        cell.setCellValue((String) record.get(key));
                    }
                }

                // Lastly apply the style based on validation state
                if (validationState.containsKey(key)) {
                    String value = (String) validationState.get(key);
                    row.getCell(colIndex).setCellStyle(validationStateStyles.get(value));
                }

                colIndex++;
            }
        }
    }

    /**
     * For now we use this to preprocess the strings but this should really be done in analysis.
     *
     * @param s
     * @return
     */
    private String processDetailString(String s) {
        if (s.startsWith("CORRECT:") || s.startsWith("UNABLE_TO_CURATE:")
                || s.startsWith("UNABLE_DETERMINE_VALIDITY_OF:") || s.startsWith("FILLED_IN:")) {
            return s.substring(s.indexOf(':') + 2);
        }

        return s;
    }

    /**
     * Add the record from the analysis result to the spreadsheet.
     *
     * @param record the record with changes applied
     * @param markers contains the information about each actor run
     * @param rowIndex the row to add to
     */
    private void addRecord(Map record, Map markers, int rowIndex) {
        HSSFRow row = recordSheet.createRow(rowIndex);

        Map validationState = (HashMap) record.get("ValidationState");

        int colIndex = 0;
        for (String key : recordColumnMap.keySet()) {

            if (!record.containsKey(key)) {
                // handle case variation in terms
                key = this.normalizeKey(key, record);
            }
            // set the cell value for each column of the record
            HSSFCell cell = row.createCell(colIndex);
            cell.setCellValue((String) record.get(key));

            // apply the appropriate style (background color) based on values for validation state

            if (validationState.containsKey(key)) {
                String value = (String) validationState.get(key);
                row.getCell(colIndex).setCellStyle(validationStateStyles.get(value));
            }

            colIndex++;
        }

        // the last few columns of the record sheet contain information about each actor run

        for (String actorName : actorNames) {
            String marker = (String) markers.get(actorName);
            if (marker != null) {
                row.createCell(colIndex).setCellValue(validationStateTextMappings.get(marker));
                row.getCell(colIndex).setCellStyle(validationStateStyles.get(marker));
            }
            colIndex++;
        }
    }

    /**
     * Initialize styles to be used when generating the spreadsheet. Each background color maps to
     * a particular validation state (CORRECT, CURATED, UNABLE_CURATE, UNABLE_DETERMINE_VALIDITY, etc)
     */
    private void initStyles() {
        HSSFPalette palette = wb.getCustomPalette();

        HSSFColor red = palette.findSimilarColor(255, 145, 145);
        HSSFColor green = palette.findSimilarColor(156, 255, 153);
        HSSFColor yellow = palette.findSimilarColor(255, 248, 153);
        HSSFColor yellow4 = palette.findSimilarColor(230, 230, 76);
        HSSFColor grey = palette.findSimilarColor(204, 204, 204);
        HSSFColor sun4 = palette.findSimilarColor(204, 204, 255);

        HSSFCellStyle unableCurateCellStyle = wb.createCellStyle();
        unableCurateCellStyle.setFillForegroundColor(red.getIndex());
        unableCurateCellStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND);

        HSSFCellStyle correctCellStyle = wb.createCellStyle();
        correctCellStyle.setFillForegroundColor(green.getIndex());
        correctCellStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND);

        HSSFCellStyle curatedCellStyle = wb.createCellStyle();
        curatedCellStyle.setFillForegroundColor(yellow.getIndex());
        curatedCellStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND);

        HSSFCellStyle filledinCellStyle = wb.createCellStyle();
        filledinCellStyle.setFillForegroundColor(yellow4.getIndex());
        filledinCellStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND);

        HSSFCellStyle unableDetermineValidityCellStyle = wb.createCellStyle();
        unableDetermineValidityCellStyle.setFillForegroundColor(grey.getIndex());
        unableDetermineValidityCellStyle.setFillPattern(HSSFCellStyle.SOLID_FOREGROUND);

        validationStateStyles = new HashMap<String, HSSFCellStyle>();

        validationStateStyles.put("UNABLE_DETERMINE_VALIDITY", unableDetermineValidityCellStyle);
        validationStateStyles.put("CURATED", curatedCellStyle);
        validationStateStyles.put("CORRECT", correctCellStyle);
        validationStateStyles.put("FILLED_IN", filledinCellStyle);
        validationStateStyles.put("UNABLE_CURATE", unableCurateCellStyle);
    }

    private void initSheets() {
        recordSheet = wb.createSheet("Analysis Results");
        HSSFRow header = recordSheet.createRow(0);

        // initialize record sheets

        int columnIndex = 0;
        for (String columnName : recordColumnMap.values()) {
            header.createCell(columnIndex++).setCellValue(columnName);
        }

        for (String actorName : actorNames) {
            header.createCell(columnIndex++).setCellValue(actorName);
        }

        // initialize actor details sheets
        actorDetailsSheets = new HashMap<String, HSSFSheet>();

        for (String actorName : actorNames) {
            HSSFSheet sheet = wb.createSheet(actorName);
            HSSFRow detailHeader = sheet.createRow(0);

            int detailColumnIndex = 0;
            for (String columnName : actorDetailsColumnMap.values()) {
                detailHeader.createCell(detailColumnIndex++).setCellValue(columnName);
            }

            actorDetailsSheets.put(actorName, sheet);
        }
    }

    /**
     * Helper method for saving the spreadsheet to a file.
     *
     * @param outFile the path to the output file
     */
    private void save(String outFile) {
        FileOutputStream out = null;
        try {
            out = new FileOutputStream(new File(outFile));
            wb.write(out);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
            } catch (IOException e) {
            }
        }
    }

    private void autoSizeColumns(HSSFSheet sheet, int cols) {
        for (int i = 0; i < cols; i++) {
            sheet.autoSizeColumn(i);
        }
    }
}