edu.harvard.med.screensaver.io.screenresults.ScreenResultParser.java Source code

Java tutorial

Introduction

Here is the source code for edu.harvard.med.screensaver.io.screenresults.ScreenResultParser.java

Source

// $HeadURL:
// svn+ssh://ant4@orchestra.med.harvard.edu/svn/iccb/screensaver/trunk/src/edu/harvard/med/screensaver/io/screenresults/ScreenResultParser.java
// $
// $Id$
//
// Copyright  2006, 2010, 2011, 2012 by the President and Fellows of Harvard College.
//
// Screensaver is an open-source project developed by the ICCB-L and NSRB labs
// at Harvard Medical School. This software is distributed under the terms of
// the GNU General Public License.

package edu.harvard.med.screensaver.io.screenresults;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.base.Predicates;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.lang.math.IntRange;
import org.apache.log4j.Logger;

import edu.harvard.med.screensaver.db.AbstractDAO;
import edu.harvard.med.screensaver.db.GenericEntityDAO;
import edu.harvard.med.screensaver.db.LibrariesDAO;
import edu.harvard.med.screensaver.db.ScreenResultsDAO;
import edu.harvard.med.screensaver.io.libraries.ExtantLibraryException;
import edu.harvard.med.screensaver.io.workbook2.Cell;
import edu.harvard.med.screensaver.io.workbook2.CellValueParser;
import edu.harvard.med.screensaver.io.workbook2.CellVocabularyParser;
import edu.harvard.med.screensaver.io.workbook2.Row;
import edu.harvard.med.screensaver.io.workbook2.WellNameParser;
import edu.harvard.med.screensaver.io.workbook2.Workbook;
import edu.harvard.med.screensaver.io.workbook2.WorkbookParseError;
import edu.harvard.med.screensaver.io.workbook2.Worksheet;
import edu.harvard.med.screensaver.model.DataModelViolationException;
import edu.harvard.med.screensaver.model.libraries.Library;
import edu.harvard.med.screensaver.model.libraries.Well;
import edu.harvard.med.screensaver.model.libraries.WellKey;
import edu.harvard.med.screensaver.model.screenresults.AssayWell;
import edu.harvard.med.screensaver.model.screenresults.AssayWellControlType;
import edu.harvard.med.screensaver.model.screenresults.ConfirmedPositiveValue;
import edu.harvard.med.screensaver.model.screenresults.DataColumn;
import edu.harvard.med.screensaver.model.screenresults.DataType;
import edu.harvard.med.screensaver.model.screenresults.PartitionedValue;
import edu.harvard.med.screensaver.model.screenresults.ResultValue;
import edu.harvard.med.screensaver.model.screenresults.ScreenResult;
import edu.harvard.med.screensaver.model.screens.AssayReadoutType;
import edu.harvard.med.screensaver.model.screens.Screen;
import edu.harvard.med.screensaver.util.AlphabeticCounter;
import edu.harvard.med.screensaver.util.DevelopmentException;
import edu.harvard.med.screensaver.util.StringUtils;

/**
 * Parses data from a workbook files (a.k.a. Excel spreadsheets) necessary for
 * instantiating a
 * {@link edu.harvard.med.screensaver.model.screenresults.ScreenResult}. A
 * {@link ScreenResult} is comprised of {@link DataColumn}s that in turn contain
 * {@link ResultValue}s. By convention, each worksheet contains the raw data for
 * a single plate, but the parser is indifferent to how data may be arranged
 * across worksheets.
 * <p>
 * The "Data Columns" worksheet is used to create
 * {@link DataColumn} objects,
 * while the raw data is used to instantiate each of the {@link DataColumn}s'
 * {@link ResultValue}
 * objects. Together, these objects are used instantiate a {@link ScreenResult}
 * object, which is the returned result of the {@link #parse} method.
 * <p>
 * The class attempts to parse the file(s) as fully as possible, carrying on in
 * the face of errors, in order to catch as many errors as possible, as this
 * will aid the manual effort of correcting the files' format and content
 * between import attempts. The {@link #getErrors()} method will
 * the errors messages encountered during parsing.
 * <p>
 * Each call to {@link #parse} will clear the errors accumulated from the
 * previous call, and so the result of calling {@link #getErrors()} will change
 * after each call to {@link #parse}.
 * 
 * @author <a mailto="andrew_tolopko@hms.harvard.edu">Andrew Tolopko</a>
 * @author <a mailto="john_sullivan@hms.harvard.edu">John Sullivan</a>
 */
public class ScreenResultParser implements ScreenResultWorkbookSpecification {
    private static final Logger log = Logger.getLogger(ScreenResultParser.class);

    private static final String NO_SCREEN_ID_FOUND_ERROR = "Screen ID not found for row: ";
    private static final String DATA_COLUMNS_SHEET_NOT_FOUND_ERROR = "\"Data Columns\" sheet not found";
    private static final String UNKNOWN_ERROR = "unknown error";
    private static final String NO_DATA_SHEETS_FOUND_ERROR = "no data worksheets were found; no result data was imported";
    private static final String NO_SUCH_WELL = "library well does not exist";
    private static final String NO_SUCH_LIBRARY_WITH_PLATE = "no library with given plate number";
    private static final String ASSAY_WELL_TYPE_INCONSISTENCY = "assay well type cannot be changed";

    private static SortedMap<String, AssayReadoutType> assayReadoutTypeMap = new TreeMap<String, AssayReadoutType>();
    private static SortedMap<String, DataType> dataTypeMap = new TreeMap<String, DataType>();
    private static SortedMap<String, Boolean> primaryOrFollowUpMap = new TreeMap<String, Boolean>();
    private static SortedMap<String, Boolean> booleanMap = new TreeMap<String, Boolean>();
    private static SortedMap<String, PartitionedValue> partitionedValueMap = new TreeMap<String, PartitionedValue>();
    private static SortedMap<String, ConfirmedPositiveValue> confirmedPositiveValueMap = new TreeMap<String, ConfirmedPositiveValue>();
    private static SortedMap<String, AssayWellControlType> assayWellControlTypeMap = new TreeMap<String, AssayWellControlType>();
    static {
        for (AssayReadoutType assayReadoutType : AssayReadoutType.values()) {
            assayReadoutTypeMap.put(assayReadoutType.getValue(), assayReadoutType);
        }

        for (DataType dataType : DataType.values()) {
            dataTypeMap.put(dataType.getValue(), dataType);
        }

        primaryOrFollowUpMap.put("", false);
        primaryOrFollowUpMap.put(PRIMARY_VALUE, false);
        primaryOrFollowUpMap.put(FOLLOWUP_VALUE, true);

        booleanMap.put("", false);
        booleanMap.put("false", false);
        booleanMap.put("no", false);
        booleanMap.put("n", false);
        booleanMap.put("0", false);
        booleanMap.put("true", true);
        booleanMap.put("yes", true);
        booleanMap.put("y", true);
        booleanMap.put("1", true);

        for (PartitionedValue pv : PartitionedValue.values()) {
            partitionedValueMap.put(pv.getValue().toLowerCase(), pv);
            partitionedValueMap.put(pv.getValue().toUpperCase(), pv);
        }

        for (ConfirmedPositiveValue pv : ConfirmedPositiveValue.values()) {
            confirmedPositiveValueMap.put(pv.getValue().toLowerCase(), pv);
            confirmedPositiveValueMap.put(pv.getValue().toUpperCase(), pv);
        }

        for (AssayWellControlType awct : AssayWellControlType.values()) {
            assayWellControlTypeMap.put(awct.getAbbreviation(), awct);
        }
    }

    private LibrariesDAO _librariesDao;
    private ScreenResultsDAO _screenResultsDao;
    private edu.harvard.med.screensaver.db.GenericEntityDAO _genericEntityDao;

    /**
     * The ScreenResult object to be populated with data parsed from the spreadsheet.
     */
    private ScreenResult _screenResult;
    private Workbook _workbook;

    private ColumnLabelsParser _columnsDerivedFromParser;
    private ExcludeParser _excludeParser;
    private CellVocabularyParser<AssayReadoutType> _assayReadoutTypeParser;
    private CellVocabularyParser<DataType> _dataTypeParser;
    private CellVocabularyParser<Boolean> _rawOrDerivedParser;
    private CellVocabularyParser<Boolean> _primaryOrFollowUpParser;
    private CellVocabularyParser<Boolean> _booleanParser;
    private CellVocabularyParser<PartitionedValue> _partitionedValueParser;
    private CellVocabularyParser<ConfirmedPositiveValue> _confirmedPositiveValueParser;
    private CellVocabularyParser<AssayWellControlType> _assayWellControlTypeParser;
    private WellNameParser _wellNameParser;

    private SortedMap<String, DataColumn> _worksheetColumnLabel2DataColumnObjectMap;
    /**
     * @motivation runtime detection of duplicate wells in the input stream
     */
    private Set<String> parsedWellKeys = Sets.newHashSet();
    private Map<Integer, Integer> _plateNumbersLoadedWithMaxReplicates = Maps.newHashMap();
    private Map<Integer, Integer> _dataColumnIndex2WorksheetColumnIndex;
    private boolean _ignoreDuplicateErrors = false;
    /**
     * The library that was associated with the plate that was last accessed.
     * @motivation optimization for findLibraryWithPlate(); reduce db I/O
     */
    private Library _lastLibrary;

    public ScreenResultParser(LibrariesDAO librariesDao, ScreenResultsDAO screenResultsDao,
            GenericEntityDAO genericEntityDao) {
        _librariesDao = librariesDao;
        _screenResultsDao = screenResultsDao;
        _genericEntityDao = genericEntityDao;
    }

    public void setIgnoreDuplicateErrors(boolean value) {
        _ignoreDuplicateErrors = value;
    }

    /**
     * Main function.
     * Parses the specified workbook file that contains Screen Result data in the
     * <a
     * href="https://wiki.med.harvard.edu/ICCBL/NewScreenResultFileFormat">"new"
     * format</a>. Errors encountered during parsing are stored with this object
     * until a parse() method is called again, and these errors can be retrieved
     * via {@link #getErrors}. The returned <code>ScreenResult</code> may only
     * be partially populated if errors are encountered, so always call
     * getErrors() to determine parsing success.
     *
     * @param screen the parent Screen of the Screen Result being parsed
     * @param workbook the workbook to be parsed
     * @param plateNumberRange the range of plate numbers to be parsed, allowing for only a subset
     *          of the data to be imported. This may be required for resource
     *          utilization purposes, where the ScreenResult must be imported over
     *          multiple passes. If null, well data for all plates will be imported.
     * @return a ScreenResult object containing the data parsed from the workbook
     *         file; <code>null</code> if a fatal error occurs (e.g. file not
     *         found)
     * @see #getErrors()
     */
    public ScreenResult parse(Screen screen, Workbook workbook, IntRange plateNumberRange,
            boolean incrementalFlush) {
        _workbook = workbook;
        return doParse(screen, plateNumberRange, incrementalFlush);
    }

    /**
     * Note: incrementalFlush is turned off by default
     */
    public ScreenResult parse(Screen screen, File workbookFile) throws FileNotFoundException {
        return parse(screen, workbookFile, null, false);
    }

    public ScreenResult parse(Screen screen, File workbookFile, boolean incrementalFlush)
            throws FileNotFoundException {
        return parse(screen, workbookFile, null, incrementalFlush);
    }

    public ScreenResult parse(Screen screen, File workbookFile, IntRange plateNumberRange, boolean incrementalFlush)
            throws FileNotFoundException {
        return parse(screen, new Workbook(workbookFile), plateNumberRange, incrementalFlush);
    }

    private ScreenResult doParse(Screen screen, IntRange plateNumberRange, boolean incrementalFlush) {
        _screenResult = screen.getScreenResult();
        _lastLibrary = null;
        _assayReadoutTypeParser = new CellVocabularyParser<AssayReadoutType>(assayReadoutTypeMap);
        _worksheetColumnLabel2DataColumnObjectMap = new TreeMap<String, DataColumn>();
        _columnsDerivedFromParser = new ColumnLabelsParser(_worksheetColumnLabel2DataColumnObjectMap);
        _excludeParser = new ExcludeParser(_worksheetColumnLabel2DataColumnObjectMap);
        _dataTypeParser = new CellVocabularyParser<DataType>(dataTypeMap);
        _primaryOrFollowUpParser = new CellVocabularyParser<Boolean>(primaryOrFollowUpMap, Boolean.FALSE);
        _booleanParser = new CellVocabularyParser<Boolean>(booleanMap, Boolean.FALSE);
        _partitionedValueParser = new CellVocabularyParser<PartitionedValue>(partitionedValueMap,
                PartitionedValue.NOT_POSITIVE);
        _confirmedPositiveValueParser = new CellVocabularyParser<ConfirmedPositiveValue>(confirmedPositiveValueMap,
                ConfirmedPositiveValue.INCONCLUSIVE);
        _assayWellControlTypeParser = new CellVocabularyParser<AssayWellControlType>(assayWellControlTypeMap);
        _wellNameParser = new WellNameParser();

        try {
            log.info("parsing " + _workbook.getName());
            if (_screenResult == null) {
                _screenResult = screen.createScreenResult();
                log.debug("created screen result");
                if (!parseDataColumnDefinitions(_screenResult, _workbook)) {
                    log.info("errors found in data column definitions, will not attempt to parse data sheets");
                    return _screenResult;
                }
            }

            initializeDataColumnLocations(_screenResult, _workbook);
            log.debug("parsing data sheets");
            parseData(_workbook, _screenResult, plateNumberRange, incrementalFlush);
        } catch (UnrecoverableScreenResultParseException e) {
            _workbook.addError(
                    "serious parse error encountered (could not continue further parsing): " + e.getMessage());
        } catch (Exception e) {
            e.printStackTrace();
            String errorMsg = UNKNOWN_ERROR + " of type : " + e.getClass() + ": " + e.getMessage();
            _workbook.addError(errorMsg);
        }
        return _screenResult;
    }

    public boolean getHasErrors() {
        return _workbook.getHasErrors();
    }

    /**
     * Return all errors that were detected during parsing. This class attempts to
     * parse as much of the workbook as possible, continuing on after finding an
     * error. The hope is that multiple errors will help a user/administrator
     * correct a workbook's errors in a batch fashion, rather than in a piecemeal
     * fashion.
     *
     * @return a <code>List&lt;String&gt;</code> of all errors generated during
     *         parsing
     */
    public List<WorkbookParseError> getErrors() {
        return _workbook.getErrors();
    }

    public Map<Integer, Integer> getPlateNumbersLoadedWithMaxReplicates() {
        return _plateNumbersLoadedWithMaxReplicates;
    }

    public ScreenResult getParsedScreenResult() {
        return _screenResult;
    }

    /**
     * Finds the total number of data columns.
     * TODO: this does not account for non-contiguous blocks of empty cells
     * @param dataColumnsSheet
     * @return highest cell column index plus 1
     * @throws UnrecoverableScreenResultParseException
     */
    private int findDataColumnCount(Worksheet dataColumnsSheet) {
        dataColumnsSheet.getColumns();
        int rows = dataColumnsSheet.getRows();

        if (rows == 0) {
            return 0;
        }

        int n = 0;
        for (Cell cell : dataColumnsSheet.getRow(0)) {
            if (cell.isEmpty())
                break;
            n++;
        }
        return n;
    }

    private int getDataColumn(int dataColumn) {
        return _dataColumnIndex2WorksheetColumnIndex.get(dataColumn);
    }

    /**
     * Parse the worksheet containing the ScreenResult data columns.
     * This method returns error results by add them to the {@link Workbook}. 
     * Therefore check the workbook after running for errors.
     * @param workbook
     * @throws UnrecoverableScreenResultParseException 
     * 
     */
    private boolean parseDataColumnDefinitions(ScreenResult screenResult, Workbook workbook)
            throws UnrecoverableScreenResultParseException {
        log.debug("parse data columns sheet");

        Worksheet dataColumnsSheet = _workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME)
                .forOrigin(DATA_COLUMNS_SHEET__FIRST_DATA_COLUMN__WORKSHEET_COLUMN_INDEX, 0);
        if (dataColumnsSheet == null) {
            _workbook.addError(DATA_COLUMNS_SHEET_NOT_FOUND_ERROR);
            return false;
        }
        Map<DataColumnProperty, Row> dataColumnPropertyRows = parseDataColumnPropertyNames(dataColumnsSheet);
        int dataColumnCount = findDataColumnCount(dataColumnsSheet);
        for (int iDataColumn = 0; iDataColumn < dataColumnCount; ++iDataColumn) {
            if (!!!dataColumnPropertyRows.containsKey(DataColumnProperty.NAME)) {
                throw new UnrecoverableScreenResultParseException(
                        DataColumnProperty.NAME + " data column property is required");
            }
            DataColumn dataColumn = screenResult.createDataColumn(
                    dataColumnPropertyRows.get(DataColumnProperty.NAME).getCell(iDataColumn, true).getString());
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.DATA_TYPE)) {
                Cell cell = dataColumnPropertyRows.get(DataColumnProperty.DATA_TYPE).getCell(iDataColumn, true);
                DataType dataType = _dataTypeParser.parse(cell);
                if (dataType != null) {
                    switch (dataType) {
                    case NUMERIC: {
                        Integer decimalPlaces = null;
                        if (dataColumnPropertyRows.containsKey(DataColumnProperty.DECIMAL_PLACES)) {
                            Cell cell2 = dataColumnPropertyRows.get(DataColumnProperty.DECIMAL_PLACES)
                                    .getCell(iDataColumn);
                            decimalPlaces = cell2.getInteger();
                            if (decimalPlaces != null && decimalPlaces < 0) {
                                cell2.addError("illegal value");
                            }
                        }
                        dataColumn.makeNumeric(decimalPlaces);
                        break;
                    }
                    case TEXT:
                        dataColumn.makeTextual();
                        break;
                    case POSITIVE_INDICATOR_BOOLEAN:
                        dataColumn.makeBooleanPositiveIndicator();
                        break;
                    case POSITIVE_INDICATOR_PARTITION:
                        dataColumn.makePartitionPositiveIndicator();
                        break;
                    case CONFIRMED_POSITIVE_INDICATOR:
                        dataColumn.makeConfirmedPositiveIndicator();
                        break;
                    default:
                        throw new DevelopmentException("unhandled data type " + dataType);
                    }
                }
            } else {
                dataColumn.makeNumeric(null);
            }

            if (dataColumnPropertyRows.containsKey(DataColumnProperty.REPLICATE)) {
                dataColumn.forReplicate(
                        dataColumnPropertyRows.get(DataColumnProperty.REPLICATE).getCell(iDataColumn).getInteger());
            }
            String howDerived = null;
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.HOW_DERIVED)) {
                howDerived = dataColumnPropertyRows.get(DataColumnProperty.HOW_DERIVED).getCell(iDataColumn)
                        .getString();
            }
            Set<DataColumn> columnsDerivedFrom = Collections.emptySet();
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.COLUMNS_DERIVED_FROM)) {
                columnsDerivedFrom = Sets.newHashSet(Iterables.filter(
                        _columnsDerivedFromParser.parseList(dataColumnPropertyRows
                                .get(DataColumnProperty.COLUMNS_DERIVED_FROM).getCell(iDataColumn)),
                        Predicates.notNull()));
            }
            if (!!!(StringUtils.isEmpty(howDerived) && columnsDerivedFrom.isEmpty())) {
                dataColumn.makeDerived(howDerived, columnsDerivedFrom);
            }
            if (!!!dataColumn.isDerived() && !!!dataColumn.isPositiveIndicator()) {
                if (dataColumnPropertyRows.containsKey(DataColumnProperty.ASSAY_READOUT_TYPE)) {
                    dataColumn.setAssayReadoutType(_assayReadoutTypeParser.parse(dataColumnPropertyRows
                            .get(DataColumnProperty.ASSAY_READOUT_TYPE).getCell(iDataColumn, false)));
                }
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.PRIMARY_OR_FOLLOWUP)) {
                dataColumn.setFollowUpData(_primaryOrFollowUpParser.parse(
                        dataColumnPropertyRows.get(DataColumnProperty.PRIMARY_OR_FOLLOWUP).getCell(iDataColumn)));
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.ASSAY_PHENOTYPE)) {
                dataColumn.forPhenotype(dataColumnPropertyRows.get(DataColumnProperty.ASSAY_PHENOTYPE)
                        .getCell(iDataColumn).getString());
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.DESCRIPTION)) {
                dataColumn.setDescription(dataColumnPropertyRows.get(DataColumnProperty.DESCRIPTION)
                        .getCell(iDataColumn).getString());
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.COMMENTS)) {
                dataColumn.setComments(
                        dataColumnPropertyRows.get(DataColumnProperty.COMMENTS).getCell(iDataColumn).getString());
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.TIME_POINT)) {
                dataColumn.forTimePoint(
                        dataColumnPropertyRows.get(DataColumnProperty.TIME_POINT).getCell(iDataColumn).getString());
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.TIME_POINT_ORDINAL)) {
                dataColumn.forTimePointOrdinal(dataColumnPropertyRows.get(DataColumnProperty.TIME_POINT_ORDINAL)
                        .getCell(iDataColumn).getInteger());
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.CHANNEL)) {
                dataColumn.forChannel(
                        dataColumnPropertyRows.get(DataColumnProperty.CHANNEL).getCell(iDataColumn).getInteger());
            }
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.ZDEPTH_ORDINAL)) {
                dataColumn.forZdepthOrdinal(dataColumnPropertyRows.get(DataColumnProperty.ZDEPTH_ORDINAL)
                        .getCell(iDataColumn).getInteger());
            }
            //      if (dataColumnPropertyRows.containsKey(DataColumnProperty.CELL_LINE)) {
            //        dataColumn.forCellLine(dataColumnPropertyRows.get(DataColumnProperty.CELL_LINE).getCell(iDataColumn).getString());
            //      }
            // note: we do this last so that _columnsDerivedFromParser does not allow the current column to be considered a valid "derived from" value
            _worksheetColumnLabel2DataColumnObjectMap.put(dataColumnPropertyRows
                    .get(DataColumnProperty.COLUMN_IN_DATA_WORKSHEET).getCell(iDataColumn, true).getAsString(),
                    dataColumn);
        }
        return !!!_workbook.getHasErrors();
    }

    private Map<DataColumnProperty, Row> parseDataColumnPropertyNames(Worksheet dataColumnsSheet) {
        Map<DataColumnProperty, Row> result = Maps.newHashMap();
        Iterator<Row> dataRows = dataColumnsSheet.forOrigin(1, 0).iterator();
        for (Row row : dataColumnsSheet.forOrigin(0, 0)) {
            if (row.isEmpty()) {
                break;
            }

            String dataColumnPropertyLabel = row.getCell(0).getString().trim();
            DataColumnProperty dataColumnProperty = DataColumnProperty.fromDisplayText(dataColumnPropertyLabel);
            Row dataOnlyRow = dataRows.next();
            if (dataColumnProperty != null) {
                result.put(dataColumnProperty, dataOnlyRow);
            } else {
                row.getCell(0).addError("unknown data column property: " + dataColumnPropertyLabel);
            }
        }
        return result;
    }

    private void initializeDataColumnLocations(ScreenResult screenResult, Workbook workbook)
            throws UnrecoverableScreenResultParseException {
        Worksheet dataColumnsSheet = workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME)
                .forOrigin(DATA_COLUMNS_SHEET__FIRST_DATA_COLUMN__WORKSHEET_COLUMN_INDEX, 0);
        if (dataColumnsSheet == null) {
            throw new UnrecoverableScreenResultParseException(DATA_COLUMNS_SHEET_NOT_FOUND_ERROR);
        }

        _dataColumnIndex2WorksheetColumnIndex = Maps.newHashMap();

        int dataColumnCount = findDataColumnCount(dataColumnsSheet);
        Map<DataColumnProperty, Row> dataColumnProperties = parseDataColumnPropertyNames(dataColumnsSheet);
        for (int iDataColumn = 0; iDataColumn < dataColumnCount; ++iDataColumn) {
            Cell cell = dataColumnProperties.get(DataColumnProperty.COLUMN_IN_DATA_WORKSHEET).getCell(iDataColumn,
                    true);
            String forColumnInDataWorksheet = cell.getString().trim();
            try {
                if (forColumnInDataWorksheet != null) {
                    _dataColumnIndex2WorksheetColumnIndex.put(iDataColumn,
                            AlphabeticCounter.toIndex(forColumnInDataWorksheet));
                    DataColumn dataColumn = screenResult.getDataColumnsList().get(iDataColumn);
                    _worksheetColumnLabel2DataColumnObjectMap.put(forColumnInDataWorksheet, dataColumn);
                }
            } catch (IllegalArgumentException e) {
                cell.addError(e.getMessage());
            }
        }
    }

    /**
     * Parse the workbook containing the ScreenResult data.
     *
     * @param workbook the workbook containing some or all of the raw data for a
     *          ScreenResult
     * @throws ExtantLibraryException if an existing Well entity cannot be found
     *           in the database
     * @throws IOException
     * @throws UnrecoverableScreenResultParseException
     */
    private void parseData(Workbook workbook, ScreenResult screenResult, IntRange plateNumberRange,
            boolean incrementalFlush)
            throws ExtantLibraryException, IOException, UnrecoverableScreenResultParseException {
        log.debug("incrementalFlush:" + incrementalFlush);
        long startTime = System.currentTimeMillis();
        long loopTime = startTime;

        int wellsWithDataLoaded = 0;
        int dataSheetsParsed = 0;
        int totalSheets = workbook.getWorkbook().getNumberOfSheets();
        int firstDataSheetIndex = workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).getSheetIndex() + 1;
        int totalDataSheets = Math.max(0, totalSheets - firstDataSheetIndex);
        plateNumberRange = plateNumberRange == null ? new IntRange(Integer.MIN_VALUE, Integer.MAX_VALUE)
                : plateNumberRange;

        // Note: we do this to make sure that the DataColumn's are persisted before we persist and clear the RV's
        if (screenResult.getEntityId() == null) {
            _genericEntityDao.persistEntity(screenResult);
        } else {
            _genericEntityDao.saveOrUpdateEntity(screenResult);
        }
        _genericEntityDao.flush();

        for (int iSheet = firstDataSheetIndex; iSheet < totalSheets; ++iSheet) {
            String sheetName = workbook.getWorkbook().getSheet(iSheet).getName();
            log.info("parsing sheet " + (dataSheetsParsed + 1) + " of " + totalDataSheets + ", " + sheetName);
            Worksheet worksheet = workbook.getWorksheet(iSheet).forOrigin(0, DATA_SHEET__FIRST_DATA_ROW_INDEX);

            for (Row row : worksheet) {
                // bring in the old findNextRow() logic
                if (row.getColumns() > 0 && !row.getCell(0).isEmpty()
                        && row.getCell(0).getAsString().trim().length() > 0) {
                    Integer plateNumber = row.getCell(WellInfoColumn.PLATE.ordinal(), true).getInteger();
                    Cell wellNameCell = row.getCell(WellInfoColumn.WELL_NAME.ordinal());
                    String wellName = _wellNameParser.parse(wellNameCell);
                    if (!wellName.equals("")) {
                        WellKey wellKey = new WellKey(plateNumber, wellName);
                        if (!plateNumberRange.containsInteger(wellKey.getPlateNumber())) {
                            if (log.isDebugEnabled()) {
                                log.debug(
                                        "Skipping, excluded range: " + plateNumberRange + ", row: " + row.getRow());
                            }
                        } else {
                            boolean duplicate = !parsedWellKeys.add(wellKey.getKey());
                            if (duplicate) {
                                if (!_ignoreDuplicateErrors) {
                                    wellNameCell.addError("duplicate well: " + wellKey);
                                } else {
                                    log.debug(
                                            "duplicate well: " + wellKey + ", duplicate found at: " + wellNameCell);
                                }
                            } else {
                                if (findLibraryWithPlate(wellKey.getPlateNumber()) == null) {
                                    wellNameCell.addError(NO_SUCH_LIBRARY_WITH_PLATE);
                                } else {
                                    Well well = _librariesDao.findWell(wellKey);
                                    if (well == null) {
                                        wellNameCell.addError(NO_SUCH_WELL + ": " + wellKey);
                                    } else if (findAssayWell(well) == null) {
                                        readResultValues(screenResult, row, well, incrementalFlush);
                                        ++wellsWithDataLoaded;
                                        if (incrementalFlush
                                                && wellsWithDataLoaded % AbstractDAO.ROWS_TO_CACHE == 0) {
                                            saveResultValuesAndFlush(screenResult, incrementalFlush);
                                            if (log.isInfoEnabled() && wellsWithDataLoaded
                                                    % (AbstractDAO.ROWS_TO_CACHE * 100) == 0) {
                                                long time = System.currentTimeMillis();
                                                long cumulativeTime = time - startTime;
                                                log.info("wellsWithDataLoaded: " + wellsWithDataLoaded
                                                        + ", cumulative time: "
                                                        + (double) cumulativeTime / (double) 60000
                                                        + " min, avg row time: "
                                                        + (double) cumulativeTime / (double) wellsWithDataLoaded
                                                        + ", loopTime: " + (time - loopTime));
                                                loopTime = time;
                                            }
                                        } // incremental
                                    }
                                }
                            }
                        }
                    }
                }
            } // for row
            ++dataSheetsParsed;
            if (wellsWithDataLoaded > 0) {
                saveResultValuesAndFlush(screenResult, incrementalFlush);
                log.info("Sheet: " + sheetName + " done, save, count: " + wellsWithDataLoaded);
                long time = System.currentTimeMillis();
                long cumulativeTime = time - startTime;
                log.info("wellsWithDataLoaded: " + wellsWithDataLoaded + ", cumulative time: "
                        + (double) cumulativeTime / (double) 60000 + " min, avg row time: "
                        + (double) cumulativeTime / (double) wellsWithDataLoaded);
            }
        }

        if (dataSheetsParsed == 0) {
            _workbook.addError(NO_DATA_SHEETS_FOUND_ERROR);
        } else {
            log.info("done parsing " + dataSheetsParsed + " data sheet(s) " + workbook.getName());
            log.info("loaded data for " + wellsWithDataLoaded + " well(s) ");
        }
    }

    private void saveResultValuesAndFlush(ScreenResult screenResult, boolean incrementalFlush) {
        log.debug("incrementally save the screen result and clear values");
        for (DataColumn dataColumn : screenResult.getDataColumns()) {
            _genericEntityDao.saveOrUpdateEntity(dataColumn);
            //TODO: this should not be required, but the writes to the DB were missing some of the RV's and this fixed it -sde4
            for (ResultValue rv : dataColumn.getResultValues()) {
                _genericEntityDao.saveOrUpdateEntity(rv);
            }
        }
        if (incrementalFlush) {
            for (DataColumn dataColumn : screenResult.getDataColumns()) {
                dataColumn.clearResultValues();
            }
            for (AssayWell assayWell : screenResult.getAssayWells()) {
                _genericEntityDao.saveOrUpdateEntity(assayWell);
            }
            screenResult.getAssayWells().clear();
            _genericEntityDao.flush();
            _genericEntityDao.clear();
        }
    }

    private void readResultValues(ScreenResult screenResult, Row row, Well well, boolean incrementalFlush) {
        AssayWellControlType assayWellControlType = _assayWellControlTypeParser
                .parse(row.getCell(WellInfoColumn.ASSAY_WELL_TYPE.ordinal()));
        try {
            AssayWell assayWell = createAssayWell(well, assayWellControlType);
            List<DataColumn> wellExcludes = _excludeParser.parseList(row.getCell(WellInfoColumn.EXCLUDE.ordinal()));
            int iDataColumn = 0;
            int maxReplicateOrdinal = 1;
            for (DataColumn dataColumn : screenResult.getDataColumns()) {
                Cell cell = row.getCell(getDataColumn(iDataColumn));
                boolean isExclude = (wellExcludes != null && wellExcludes.contains(dataColumn));

                ResultValue newResultValue = null;
                if (dataColumn.isBooleanPositiveIndicator()) {
                    newResultValue = dataColumn.createBooleanPositiveResultValue(assayWell,
                            cell.isBoolean() ? cell.getBoolean() : _booleanParser.parse(cell), isExclude);
                } else if (dataColumn.isPartitionPositiveIndicator()) {
                    newResultValue = dataColumn.createPartitionedPositiveResultValue(assayWell,
                            _partitionedValueParser.parse(cell), isExclude);
                } else if (dataColumn.isConfirmedPositiveIndicator()) {
                    newResultValue = dataColumn.createConfirmedPositiveResultValue(assayWell,
                            _confirmedPositiveValueParser.parse(cell), isExclude);
                } else if (dataColumn.isNumeric()) {
                    newResultValue = dataColumn.createResultValue(assayWell, cell.getDouble(), isExclude);
                } else {
                    newResultValue = dataColumn.createResultValue(assayWell, cell.getString(), isExclude);
                }

                // update the maxReplicateOrdinal to track the actual number of
                // replicates that have data loaded for a particular plate
                // (this allows us to later calculate which how many replicates
                // actually have associated data, in case some of the "extra" screened
                // replicates were ignored by the screener)
                if (!dataColumn.isDerived() && !newResultValue.isNull()) {
                    if (dataColumn.getReplicateOrdinal() != null) {
                        maxReplicateOrdinal = Math.max(maxReplicateOrdinal, dataColumn.getReplicateOrdinal());
                    }
                }
                if (newResultValue == null) {
                    cell.addError("duplicate well");
                }

                ++iDataColumn;
            }

            recordAssayPlatesDataLoaded(well, maxReplicateOrdinal);

            if (incrementalFlush) {
                // [#2119] Optimize ScreenResultParser for scalability:
                // - in memory RV's must be reloaded as needed
                assayWell.getLibraryWell().getResultValues().clear();
            }
        } catch (DataModelViolationException e) {
            row.getCell(WellInfoColumn.ASSAY_WELL_TYPE.ordinal()).addError(e.getMessage());
        }
    }

    private AssayWell createAssayWell(Well well, AssayWellControlType assayWellControlType)
            throws DataModelViolationException {
        AssayWell assayWell = _screenResult.createAssayWell(well);
        assayWell.setAssayWellControlType(assayWellControlType);
        return assayWell;
    }

    private AssayWell findAssayWell(Well well) {
        AssayWell assayWell = _screenResultsDao.findAssayWell(_screenResult, well.getWellKey());
        //    if (assayWell != null) {
        //      if (assayWell.getAssayWellControlType() != assayWellControlType) {
        //        _workbook.addError(ASSAY_WELL_TYPE_INCONSISTENCY + ": " + well);
        //        return null;
        //      }
        //    }
        return assayWell;
    }

    private void recordAssayPlatesDataLoaded(Well well, int replicateCount) {
        Integer maxReplicateCount = _plateNumbersLoadedWithMaxReplicates.get(well.getPlateNumber());
        if (maxReplicateCount == null || maxReplicateCount.compareTo(replicateCount) < 0) {
            _plateNumbersLoadedWithMaxReplicates.put(well.getPlateNumber(), replicateCount);
        }
    }

    /**
     * @motivation database I/O optimization
     */
    private Library findLibraryWithPlate(Integer plateNumber) {
        if (_lastLibrary == null || !_lastLibrary.containsPlate(plateNumber)) {
            _lastLibrary = _librariesDao.findLibraryWithPlate(plateNumber);
        }
        return _lastLibrary;
    }

    private static class ParsedScreenInfo {
        private Integer _screenId;

        public Integer getScreenId() {
            return _screenId;
        }

        public void setScreenId(Integer screenId) {
            _screenId = screenId;
        }
    }

    public class ColumnLabelsParser implements CellValueParser<DataColumn> {
        protected Map<String, DataColumn> _columnLabel2ColMap;
        private Pattern columnIdPattern = Pattern.compile("[A-Z]+");

        public ColumnLabelsParser(Map<String, DataColumn> columnLabel2ColMap) {
            _columnLabel2ColMap = columnLabel2ColMap;
        }

        public DataColumn parse(Cell cell) {
            throw new UnsupportedOperationException();
        }

        public List<DataColumn> parseList(Cell cell) {
            String textMultiValue = cell.getString();
            List<DataColumn> result = new ArrayList<DataColumn>();

            if (textMultiValue == null || textMultiValue.trim().length() == 0) {
                return result;
            }

            String[] textValues = textMultiValue.split(",");
            for (int i = 0; i < textValues.length; i++) {
                String text = textValues[i].trim();
                DataColumn dataColumn = doParseSingleValue(text, cell);
                if (dataColumn != null) {
                    result.add(dataColumn);
                } else {
                    cell.addError("invalid Data Column worksheet column label '" + text + "' (expected one of "
                            + _columnLabel2ColMap.keySet() + ")");
                }
            }
            return result;
        }

        protected DataColumn doParseSingleValue(String value, Cell cell) {
            Matcher matcher = columnIdPattern.matcher(value);
            if (!matcher.matches()) {
                return null;
            }
            String columnLabel = matcher.group(0);
            return _columnLabel2ColMap.get(columnLabel);
        }
    }

    private class ExcludeParser extends ColumnLabelsParser {
        public ExcludeParser(Map<String, DataColumn> columnLabel2ColMap) {
            super(columnLabel2ColMap);
        }

        public List<DataColumn> parseList(Cell cell) {
            String textMultiValue = cell.getString();

            if (textMultiValue != null
                    && textMultiValue.equalsIgnoreCase(ScreenResultWorkbookSpecification.EXCLUDE_ALL_VALUE)) {
                return new ArrayList<DataColumn>(_columnLabel2ColMap.values());
            }

            if (textMultiValue == null) {
                return Collections.emptyList();
            }

            return super.parseList(cell);
        }
    }
}