Java tutorial
/********************************************************************************* * Ephesoft is a Intelligent Document Capture and Mailroom Automation program * developed by Ephesoft, Inc. Copyright (C) 2015 Ephesoft Inc. * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU Affero General Public License version 3 as published by the * Free Software Foundation with the addition of the following permission added * to Section 15 as permitted in Section 7(a): FOR ANY PART OF THE COVERED WORK * IN WHICH THE COPYRIGHT IS OWNED BY EPHESOFT, EPHESOFT DISCLAIMS THE WARRANTY * OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more * details. * * You should have received a copy of the GNU Affero General Public License along with * this program; if not, see http://www.gnu.org/licenses or write to the Free * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301 USA. * * You can contact Ephesoft, Inc. headquarters at 111 Academy Way, * Irvine, CA 92617, USA. or at email address info@ephesoft.com. * * The interactive user interfaces in modified source and object code versions * of this program must display Appropriate Legal Notices, as required under * Section 5 of the GNU Affero General Public License version 3. * * In accordance with Section 7(b) of the GNU Affero General Public License version 3, * these Appropriate Legal Notices must retain the display of the "Ephesoft" logo. * If the display of the logo is not reasonably feasible for * technical reasons, the Appropriate Legal Notices must display the words * "Powered by Ephesoft". 
********************************************************************************/ package com.ephesoft.dcma.tableextraction; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import javax.script.ScriptException; import org.apache.commons.collections.CollectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.stereotype.Component; import com.ephesoft.dcma.batch.schema.Batch; import com.ephesoft.dcma.batch.schema.Column; import com.ephesoft.dcma.batch.schema.Coordinates; import com.ephesoft.dcma.batch.schema.DataTable; import com.ephesoft.dcma.batch.schema.DocField; import com.ephesoft.dcma.batch.schema.Document; import com.ephesoft.dcma.batch.schema.HeaderRow; import com.ephesoft.dcma.batch.schema.HocrPages; import com.ephesoft.dcma.batch.schema.Page; import com.ephesoft.dcma.batch.schema.Row; import com.ephesoft.dcma.batch.schema.Batch.Documents; import com.ephesoft.dcma.batch.schema.Document.DataTables; import com.ephesoft.dcma.batch.schema.Document.DocumentLevelFields; import com.ephesoft.dcma.batch.schema.HocrPages.HocrPage; import com.ephesoft.dcma.batch.schema.HocrPages.HocrPage.Spans.Span; import com.ephesoft.dcma.batch.service.BatchSchemaService; import com.ephesoft.dcma.batch.service.PluginPropertiesService; import com.ephesoft.dcma.common.HocrUtil; import com.ephesoft.dcma.common.LineDataCarrier; import com.ephesoft.dcma.core.common.CategoryType; import com.ephesoft.dcma.core.common.CurrencyCode; import com.ephesoft.dcma.core.common.ExpressionEvaluator; import com.ephesoft.dcma.core.common.TableColumnVO; import com.ephesoft.dcma.core.exception.DCMAApplicationException; import com.ephesoft.dcma.da.domain.BatchClass; import 
com.ephesoft.dcma.da.domain.DocumentType; import com.ephesoft.dcma.da.domain.TableColumnExtractionRule; import com.ephesoft.dcma.da.domain.TableColumnsInfo; import com.ephesoft.dcma.da.domain.TableExtractionRule; import com.ephesoft.dcma.da.domain.TableInfo; import com.ephesoft.dcma.da.domain.TableRuleInfo; import com.ephesoft.dcma.da.service.BatchClassService; import com.ephesoft.dcma.da.service.DocumentTypeService; import com.ephesoft.dcma.da.service.TableInfoService; import com.ephesoft.dcma.tablefinder.constants.TableExtractionConstants; import com.ephesoft.dcma.tablefinder.data.DataCarrier; import com.ephesoft.dcma.tablefinder.service.TableFinderService; import com.ephesoft.dcma.tablefinder.share.ColumnCoordinates; import com.ephesoft.dcma.tablefinder.share.ColumnHeaderExtractionDataCarrier; import com.ephesoft.dcma.tablefinder.share.DataTableService; import com.ephesoft.dcma.tablefinder.share.RegexValidationDataCarrier; import com.ephesoft.dcma.tablefinder.share.TableExtractionAPI; import com.ephesoft.dcma.tablefinder.share.TableExtractionAPIResult; import com.ephesoft.dcma.tablefinder.share.TableExtractionResult; import com.ephesoft.dcma.tablefinder.share.TableExtractionResultModifierUtility; import com.ephesoft.dcma.tablefinder.share.TableRowFinderUtility; import com.ephesoft.dcma.util.CollectionUtil; import com.ephesoft.dcma.util.CurrencyUtil; import com.ephesoft.dcma.util.EphesoftStringUtil; import com.ephesoft.dcma.util.NumberUtil; /** * This class is responsible to extract table grid data from the hOCR files(html files with HOCR text) from image files. Service will * read all the pages one by one and search some pattern corresponding to document type and update the batch xml file with data tables * values. * * @author Ephesoft * @version 1.0 * @see com.ephesoft.dcma.tableextraction.service.TableExtractionService * */ @Component public class TableExtraction { /** * LOGGER to print the logging information. 
*/
    private static final Logger LOGGER = LoggerFactory.getLogger(TableExtraction.class);

    /**
     * Plug-in name passed to the plug-in properties service when the extraction switch is read.
     */
    private static final String TABLE_EXTRACTION_PLUGIN = "TABLE_EXTRACTION";

    /**
     * Constant for document identifier.
     */
    private final String documentIdetifier = "DOC";

    /**
     * Reference of BatchSchemaService.
     */
    @Autowired
    private BatchSchemaService batchSchemaService;

    /**
     * Reference of TableInfoService.
     */
    @Autowired
    private TableInfoService tableInfoService;

    /**
     * Reference of TableFinderService.
     */
    @Autowired
    private TableFinderService tableFinderService;

    /**
     * Plug-in properties service wired from the "batchInstancePluginPropertiesService" bean.
     */
    @Autowired
    @Qualifier("batchInstancePluginPropertiesService")
    private PluginPropertiesService pluginPropertiesService;

    /**
     * Reference of DocumentTypeService.
     */
    @Autowired
    private DocumentTypeService documentTypeService;

    /**
     * Plug-in properties service wired from the "batchClassPluginPropertiesService" bean.
     */
    @Autowired
    @Qualifier("batchClassPluginPropertiesService")
    private PluginPropertiesService batchClassPluginPropertiesService;

    /**
     * Reference of BatchClassService.
     */
    @Autowired
    private BatchClassService batchClassService;

    /**
     * Default locale of the JVM, captured when this component is instantiated.
     */
    Locale defaultLocale = Locale.getDefault();

    /**
     * Invalid characters which need to be removed before applying a table rule to the extracted column value.
     */
    private String invalidRuleCharacters;

    /**
     * Extracts the table data of every document of a batch instance and updates the batch with the extracted data tables.
* * @param batchInstanceIdentifier * @return * @throws DCMAApplicationException */ public final boolean extractFields(final String batchInstanceIdentifier) throws DCMAApplicationException { final String switchValue = pluginPropertiesService.getPropertyValue(batchInstanceIdentifier, TABLE_EXTRACTION_PLUGIN, TableExtractionProperties.TABLE_EXTRACTION_SWITCH); boolean isSuccessful = true; if (("ON").equalsIgnoreCase(switchValue)) { String errMsg = null; setInvalidRuleCharacters(tableFinderService.getInvalidRuleCharacters()); final int gapBetweenColumnWords = tableFinderService.getGapBetweenColumnWords(); if (null == batchInstanceIdentifier) { errMsg = "Invalid batchInstanceId."; LOGGER.error(errMsg); throw new DCMAApplicationException(errMsg); } LOGGER.info("batchInstanceIdentifier : " + batchInstanceIdentifier); final Batch batch = batchSchemaService.getBatch(batchInstanceIdentifier); try { final List<Document> docTypeList = batch.getDocuments().getDocument(); if (null == docTypeList) { LOGGER.info("In valid batch documents."); } else { processDocPage(docTypeList, batchInstanceIdentifier, batch, gapBetweenColumnWords); } } catch (final DCMAApplicationException e) { isSuccessful = false; LOGGER.error(e.getMessage()); throw new DCMAApplicationException(e.getMessage(), e); } catch (final Exception e) { isSuccessful = false; LOGGER.error(e.getMessage()); throw new DCMAApplicationException(e.getMessage(), e); } batchSchemaService.updateBatch(batch); } else { LOGGER.info("Skipping Table extraction. Switch set as off."); } return isSuccessful; } /** * This method will process for each page for each document. * * @param xmlDocuments List<DocumentType> * @param batchInstanceIdentifier String * @param batch Batch * @param gapBetweenColumnWords * @return isSuccessful * @throws DCMAApplicationException Check for all the input parameters. 
*/ private boolean processDocPage(final List<Document> xmlDocuments, final String batchInstanceIdentifier, final Batch batch, final int gapBetweenColumnWords) throws DCMAApplicationException { boolean isSuccessful = false; if (null == xmlDocuments || xmlDocuments.isEmpty()) { throw new DCMAApplicationException("In valid parameters."); } for (final Document document : xmlDocuments) { // Create doc level fields for document. createDocLevelFields(document, batchInstanceIdentifier); if (null == document) { continue; } final String docTypeNameBatch = document.getType(); if (null == docTypeNameBatch || docTypeNameBatch.isEmpty()) { continue; } final List<Page> pageList = document.getPages().getPage(); if (null == pageList || pageList.isEmpty()) { continue; } final String batchClassIdentifier = batch.getBatchClassIdentifier(); if (null == batchClassIdentifier || batchClassIdentifier.isEmpty()) { throw new DCMAApplicationException("Batch class identifier is null or empty..."); } final List<TableInfo> tableInfoList = tableInfoService.getTableInfoByDocTypeName(docTypeNameBatch, batchClassIdentifier); if (null == tableInfoList || tableInfoList.isEmpty()) { LOGGER.info("Table info list is null or empty."); continue; } DataTables dataTables = document.getDataTables(); if (null == dataTables) { dataTables = new DataTables(); document.setDataTables(dataTables); } final List<DataTable> dataTableList = dataTables.getDataTable(); isSuccessful = readAllDataTables(tableInfoList, dataTableList, pageList, batchInstanceIdentifier, gapBetweenColumnWords); } return isSuccessful; } /** * This method reads all table info and extracts table accordingly. 
* * @param tableInfoList * @param startDataCarrier * @param lineDataCarrierList * @param dataTableList * @param pageList * @param gapBetweenColumnWords * @return boolean * @throws DCMAApplicationException */ private boolean readAllDataTables(final List<TableInfo> tableInfoList, final List<DataTable> dataTableList, final List<Page> pageList, final String batchInstanceIdentifier, final int gapBetweenColumnWords) throws DCMAApplicationException { DataTableService tableExtractionUtility = new DataTableService(); for (final TableInfo tableInfo : tableInfoList) { if (null == tableInfo) { LOGGER.info("Table info is null."); continue; } final String tableName = tableInfo.getName(); LOGGER.info(EphesoftStringUtil.concatenate("Extracting data for table : ", tableName)); if (EphesoftStringUtil.isNullOrEmpty(tableName)) { throw new DCMAApplicationException("Table name is null or empty."); } final DataTable dataTable = new DataTable(); dataTable.setName(tableName); dataTableList.add(dataTable); tableExtractionUtility.initializeHeaderRow(dataTable); tableExtractionUtility.initializeDataTableRows(dataTable); final List<TableColumnsInfo> tableColumnsInfoList = tableInfo.getTableColumnsInfo(); if (CollectionUtils.isEmpty(tableColumnsInfoList)) { LOGGER.error("Table Columns Info list is null or empty."); continue; } // Changed for bug #12556: Table ExtractionIf NO Table Extraction Rule is defined then on Validation screen Table // icon is // displayed but no Table on clicking that icon. HeaderRow headerRow = dataTable.getHeaderRow(); HeaderRow.Columns columnsHeader = headerRow.getColumns(); final List<Column> columnHeaderList = columnsHeader.getColumn(); tableExtractionUtility.createColumnHeaders(columnHeaderList, tableColumnsInfoList); final List<TableExtractionRule> tableExtractionRuleList = tableInfo.getTableExtractionRules(); // Getting attributes for extracting table data. 
final CurrencyCode tableCurrencyCode = tableInfo.getCurrencyCode(); final String ruleOperatorDefinedForTable = tableInfo.getRuleOperator(); final boolean isRemoveInvalidRows = tableInfo.isRemoveInvalidRows(); final List<TableRuleInfo> tableValidationRulesList = tableInfo.getTableRuleInfo(); LOGGER.debug(EphesoftStringUtil.concatenate("Currency for the table is: ", tableCurrencyCode, "\n Rule Operator for table is: ", ruleOperatorDefinedForTable, "\n Do we need to remove invalid rows?: ", isRemoveInvalidRows)); List<LineDataCarrier> lineDataCarrierList = null; // Map to store the column header info against column name. // changed to avoid memory wastage. Map<String, DataCarrier> colHeaderInfoMap = null; // Maps whether a column is currency or not. If column field is currency then its currency code is stored. // Currently we are storing one locale per table. so value would be same for all keys. But this representation would be // helpful // when in any scenario currency support would be at column level. 
Map<String, CurrencyCode> columnCurrencyMap = null; List<TableColumnVO> tableColumnVOList = null; int noOfTableColumns = tableColumnsInfoList.size(); List<TableExtractionResult> allTableRowsList = new ArrayList<TableExtractionResult>( tableExtractionRuleList.size()); if (!CollectionUtils.isEmpty(tableExtractionRuleList)) { for (TableExtractionRule tableExtractionRule : tableExtractionRuleList) { if (null != tableExtractionRule) { final String startPattern = tableExtractionRule.getStartPattern(); if (EphesoftStringUtil.isNullOrEmpty(startPattern)) { LOGGER.error("Rule: ", tableExtractionRule.getRuleName(), "'s start pattern is null or empty."); continue; } final String endPattern = tableExtractionRule.getEndPattern(); TableExtractionAPI tableExtractionAPI = tableExtractionUtility .getTableExtractionAPI(tableExtractionRule.getTableAPI()); final List<TableColumnExtractionRule> tableColumnExtractionRuleList = tableExtractionRule .getTableColumnExtractionRules(); // getting all table column info for extraction. tableColumnVOList = tableExtractionUtility.getTableColumnData(tableColumnsInfoList, tableColumnVOList, noOfTableColumns, tableColumnExtractionRuleList); // Searching all rows of the page. lineDataCarrierList = TableRowFinderUtility.searchAllRowOfTables(pageList, startPattern, endPattern, batchInstanceIdentifier, tableFinderService.getFuzzyMatchThresholdValue()); if (CollectionUtils.isNotEmpty(lineDataCarrierList)) { boolean colHeaderValidationRequired = tableExtractionAPI .isColHeaderValidationRequired(); boolean colCoordValidationRequired = tableExtractionAPI.isColCoordValidationRequired(); // Get extracted column header value mapped with column headers. if (colHeaderValidationRequired) { colHeaderInfoMap = tableExtractionUtility.getColumnHeaderMap(colHeaderInfoMap, tableColumnVOList); // call method to populate the columnHeaderInfoMap. 
tableExtractionUtility.setColumnHeaderInfo(lineDataCarrierList, colHeaderInfoMap, tableColumnVOList, tableFinderService.getFuzzyMatchThresholdValue()); } boolean isTableColumnListDirty = tableExtractionUtility .sortTableColumnsInOrderOfOccurance(colHeaderInfoMap, tableColumnVOList, colHeaderValidationRequired, colCoordValidationRequired); final List<Row> rowList = addDataTablesValues(lineDataCarrierList, tableColumnVOList, colHeaderInfoMap, tableExtractionAPI, gapBetweenColumnWords); if (CollectionUtils.isNotEmpty(rowList)) { columnCurrencyMap = tableExtractionUtility.getColumnCurrencyMap(tableCurrencyCode, columnCurrencyMap, noOfTableColumns, tableColumnExtractionRuleList); // Multiple rule validation implementation. applyTableValidationRulesOnRows(ruleOperatorDefinedForTable, tableValidationRulesList, isRemoveInvalidRows, columnCurrencyMap, rowList); if (null != columnCurrencyMap) { columnCurrencyMap.clear(); } allTableRowsList .add(new TableExtractionResult(rowList, tableExtractionRule.getRuleName())); } // Data extracted for columns in rows is different order from ui displayed table columns. Assign values to // correct column. if (isTableColumnListDirty) { for (TableExtractionResult list : allTableRowsList) { if (null != list) { tableExtractionUtility.sortTableRowsInUiOrder(tableColumnsInfoList, list.getRowList()); } } } if (null != colHeaderInfoMap) { colHeaderInfoMap.clear(); } lineDataCarrierList.clear(); } if (null != tableColumnVOList) { tableColumnVOList.clear(); } } } } // Find the table with maximum valid data. tableExtractionUtility.addBestRuleRows(dataTable, allTableRowsList); List<Row> dataTableRowList = dataTable.getRows().getRow(); if (CollectionUtil.isEmpty(dataTableRowList)) { // Adding by default one empty row in table results if no data was extracted from table extraction rules. // This row has application of validation patterns on empty columns and table validation rules on the row. 
insertEmptyRow(dataTableRowList, tableColumnsInfoList); applyTableValidationRulesOnRows(ruleOperatorDefinedForTable, tableValidationRulesList, isRemoveInvalidRows, columnCurrencyMap, dataTableRowList); } } return true; } /** * Inserts an empty row to a list of table rows. * * @param dataTableRowList {@link List}<{@link Row}> * @param tableColumnsInfoList {@link List}<{@link TableColumnsInfo}> */ private void insertEmptyRow(List<Row> dataTableRowList, final List<TableColumnsInfo> tableColumnsInfoList) { Row row = new Row(); row.setMannualExtraction(false); row.setIsRuleValid(true); Row.Columns columnsRow = row.getColumns(); if (null == columnsRow) { columnsRow = new Row.Columns(); row.setColumns(columnsRow); } final List<Column> columnRowList = columnsRow.getColumn(); for (final TableColumnsInfo tableColumnsInfo : tableColumnsInfoList) { Column column = new Column(); column.setValid(false); column.setValidationRequired(false); column.setConfidence(0.0f); column.setForceReview(false); column.setOcrConfidence(0.0f); column.setOcrConfidenceThreshold(0.0f); column.setValid(false); column.setValidationRequired(false); column.setName(tableColumnsInfo.getColumnName()); column.setValue(TableExtractionConstants.EMPTY); column.setValid(TableExtractionResultModifierUtility.isValidWithPattern(column.getValue(), tableColumnsInfo.getValidationPattern())); columnRowList.add(column); } dataTableRowList.add(row); } /** * Extracts table data. 
* * @param lineDataCarrierList {@link List}<{@link LineDataCarrier}> * @param tableColumnList {@link List}<{@link TableColumnVO}> * @param colHeaderInfoMap {@link Map}<{@link String}, {@link DataCarrier}> * @param tableExtractionAPI {@link TableExtractionAPI} * @return {@link List}<{@link Row}> * @throws DCMAApplicationException */ private List<Row> addDataTablesValues(final List<LineDataCarrier> lineDataCarrierList, final List<TableColumnVO> tableColumnList, final Map<String, DataCarrier> colHeaderInfoMap, final TableExtractionAPI tableExtractionAPI, final int gapBetweenColumnWords) throws DCMAApplicationException { List<Row> rowList = null; if (CollectionUtils.isNotEmpty(lineDataCarrierList) && CollectionUtils.isNotEmpty(tableColumnList) && null != tableExtractionAPI) { /** isRowValidForAllMandatoryColumns is a boolean variable used in context of multiline spanning column data in table. */ boolean isRowValidForAllMandatoryColumns; /** previous row to merge with next row if mandatory is set for the column. 
*/ Row previousRow = new Row(); previousRow.setIsRuleValid(false); previousRow.setMannualExtraction(false); boolean isRowAvaliable = false; boolean isRowValidForAllRequiredColumns = true; boolean allRowFinishTasksDone = true; final String tableExtractionAPIString = tableExtractionAPI.getTableExtractionAPI(); final ExpressionEvaluator<Boolean> expressionEvaluator = new ExpressionEvaluator<Boolean>( tableExtractionAPIString.toUpperCase()); int lineIndex = TableExtractionConstants.START_INDEX; boolean isFirstPage = true; String firstPageId = null; if (lineDataCarrierList != null && !lineDataCarrierList.isEmpty()) { firstPageId = lineDataCarrierList.get(0).getPageID(); } LineDataCarrier lineDataCarrier = null; boolean columnHeaderValidationRequired = tableExtractionAPI.isColHeaderValidationRequired(); boolean columnCoordinateValidationRequired = tableExtractionAPI.isColCoordValidationRequired(); boolean regexValidationRequired = tableExtractionAPI.isRegexValidationRequired(); ColumnHeaderExtractionDataCarrier columnHeaderExtractionDataCarrier = null; while (lineIndex < lineDataCarrierList.size()) { lineDataCarrier = lineDataCarrierList.get(lineIndex); final String pageID = lineDataCarrier.getPageID(); if (!pageID.equals(firstPageId)) { isFirstPage = false; } final String rowData = lineDataCarrier.getLineRowData(); final List<Span> spanList = lineDataCarrier.getSpanList(); LOGGER.info(EphesoftStringUtil.concatenate("Row Data : ", rowData)); isRowValidForAllRequiredColumns = true; isRowValidForAllMandatoryColumns = true; final Row row = new Row(); row.setIsRuleValid(false); row.setMannualExtraction(false); // By default setting rule validation as true. 
row.setIsRuleValid(true); Row.Columns columnsRow = TableRowFinderUtility.getRowColumns(lineDataCarrier, row); final List<Column> columnRowList = columnsRow.getColumn(); isRowAvaliable = false; for (final TableColumnVO tableColumn : tableColumnList) { LOGGER.info(EphesoftStringUtil.concatenate("Extracting column data for column = ", tableColumn.getColumnName())); // Search for all the table row data one by one. final Column column = new Column(); column.setValid(false); column.setValidationRequired(false); column.setConfidence(0.0f); column.setForceReview(false); column.setOcrConfidence(0.0f); column.setOcrConfidenceThreshold(0.0f); column.setValid(false); column.setValidationRequired(false); TableRowFinderUtility.setColumnProperties(pageID, column, null, 0); column.setName(tableColumn.getColumnName()); columnRowList.add(column); DataCarrier colHeaderDataCarrier = null; final Integer indexOfTableColumn = tableColumnList.indexOf(tableColumn); boolean isRegexValidationPassed = false; boolean isColHeaderValidationPassed = false; boolean isColCoordValidationPassed = false; boolean furtherValidationRequired = true; Coordinates valueCoordinates = null; // This confidence parameter for column header extraction mechanism indicates whether result obtained from this // extraction matches 100% to the validation pattern. If it doesn't match, we search to look for extraction results // by further algorithms (if applicable). boolean isColumnHeaderExtractionConfidenceMaximum = false; // This confidence parameter for column coordinate extraction mechanism indicates whether result obtained from this // extraction matches 100% to the validation pattern. If it doesn't match, we search to look for extraction results // by further algorithms (if applicable). 
boolean isColumnCoordinateExtractionConfidenceMaximum = false; if (columnHeaderValidationRequired) { if (null != colHeaderInfoMap && !colHeaderInfoMap.isEmpty()) { colHeaderDataCarrier = colHeaderInfoMap.get(tableColumn.getColumnName()); } if (null == columnHeaderExtractionDataCarrier) { columnHeaderExtractionDataCarrier = new ColumnHeaderExtractionDataCarrier(); } columnHeaderExtractionDataCarrier.setColHeaderDataCarrier(colHeaderDataCarrier); columnHeaderExtractionDataCarrier.setColumn(column); columnHeaderExtractionDataCarrier.setLineDataCarrier(lineDataCarrier); columnHeaderExtractionDataCarrier.setSpanList(spanList); columnHeaderExtractionDataCarrier.setTableColumn(tableColumn); TableExtractionAPIResult tableExtractionAPIResult = TableRowFinderUtility .runColumnHeaderExtraction(columnHeaderExtractionDataCarrier, gapBetweenColumnWords, isFirstPage); valueCoordinates = tableExtractionAPIResult.getValueCoordinates(); isColumnHeaderExtractionConfidenceMaximum = tableExtractionAPIResult.isConfidenceMaximum(); isColHeaderValidationPassed = tableExtractionAPIResult.isValidationPassed(); } if (TableRowFinderUtility.isDataValid(expressionEvaluator, false, isColumnHeaderExtractionConfidenceMaximum, false) || !TableRowFinderUtility.isDataValid(expressionEvaluator, true, isColHeaderValidationPassed, true)) { LOGGER.info("No further validation required....."); furtherValidationRequired = false; } if (furtherValidationRequired && columnCoordinateValidationRequired) { LOGGER.info("Applying Column Coordinates Validation for table extraction...."); if (TableRowFinderUtility.isDataValid(expressionEvaluator, false, false, true)) { TableExtractionAPIResult tableExtractionAPIResult = TableRowFinderUtility .runColumnCoordinateExtraction(pageID, tableColumn, column, spanList); valueCoordinates = tableExtractionAPIResult.getValueCoordinates(); isColumnCoordinateExtractionConfidenceMaximum = tableExtractionAPIResult .isConfidenceMaximum(); isColCoordValidationPassed = 
tableExtractionAPIResult.isValidationPassed(); } else if (isColumnHeaderExtractionConfidenceMaximum && valueCoordinates != null) { ColumnCoordinates columnCoordinates = TableRowFinderUtility .getXColumncoordinates(tableColumn); if (TableRowFinderUtility.isColumnValidWithColCoord(valueCoordinates.getX0().intValue(), valueCoordinates.getX1().intValue(), columnCoordinates.getX0Coordinate(), columnCoordinates.getX1coordinate())) { isColCoordValidationPassed = true; isColumnCoordinateExtractionConfidenceMaximum = true; column.setValid(Boolean.TRUE); LOGGER.info("Getting rectangle coordinates for the value. "); valueCoordinates = HocrUtil .getRectangleCoordinates(column.getCoordinatesList().getCoordinates()); } } else { TableExtractionAPIResult tableExtractionAPIResult = TableRowFinderUtility .runColumnCoordinateExtraction(pageID, tableColumn, column, spanList); valueCoordinates = tableExtractionAPIResult.getValueCoordinates(); isColumnCoordinateExtractionConfidenceMaximum = tableExtractionAPIResult .isConfidenceMaximum(); isColCoordValidationPassed = tableExtractionAPIResult.isValidationPassed(); } // Checking if column header validation and/or column coordinate validation results are confident enough with // respect to table extraction API needed. 
if ((TableRowFinderUtility.isDataValid(expressionEvaluator, false, isColumnHeaderExtractionConfidenceMaximum, isColumnCoordinateExtractionConfidenceMaximum) || !TableRowFinderUtility.isDataValid(expressionEvaluator, true, isColHeaderValidationPassed, isColCoordValidationPassed))) { LOGGER.info("No further validation required...."); furtherValidationRequired = false; } } if (furtherValidationRequired && regexValidationRequired) { RegexValidationDataCarrier regexvalidationDataCarrier = new RegexValidationDataCarrier(); regexvalidationDataCarrier.setColumn(column); regexvalidationDataCarrier.setColumnCoordinates(valueCoordinates); regexvalidationDataCarrier.setIndexOfTableColumn(indexOfTableColumn); regexvalidationDataCarrier.setLineDataCarrier(lineDataCarrier); regexvalidationDataCarrier.setPageID(pageID); regexvalidationDataCarrier.setRowData(rowData); regexvalidationDataCarrier.setSpanList(spanList); regexvalidationDataCarrier.setTableColumn(tableColumn); isRegexValidationPassed = TableRowFinderUtility.runRegexValidation(expressionEvaluator, columnHeaderValidationRequired, columnCoordinateValidationRequired, columnRowList, colHeaderDataCarrier, regexvalidationDataCarrier); } if (TableRowFinderUtility.isDataValid(expressionEvaluator, isRegexValidationPassed, isColHeaderValidationPassed, isColCoordValidationPassed)) { isRowAvaliable = true; LOGGER.info("Data valid with respect to all the validations.."); } else { TableRowFinderUtility.setColumnProperties(pageID, column, TableExtractionConstants.EMPTY, 0); } column.setName(tableColumn.getColumnName()); LOGGER.info("Checking if row contains valid data for mandatory columns....."); if (tableColumn.isMultilineAnchor() && (column.getValue() == null || column.getValue().isEmpty())) { isRowValidForAllMandatoryColumns = false; } } if (isRowAvaliable) { LOGGER.info("Row is available."); if (null == rowList) { rowList = new ArrayList<Row>(); } /** * if executes if current row has data to be merged with last added row to 
rowList, because of missing content in * atleast one mandatory column. */ if (!isRowValidForAllMandatoryColumns && null != previousRow && previousRow.getColumns() != null) { final List<Column> previousRowColumnsList = previousRow.getColumns().getColumn(); LOGGER.info("Merging rows of multiline data..."); TableExtractionResultModifierUtility.mergeMultilineRows(previousRowColumnsList, columnRowList); TableExtractionResultModifierUtility.setMergedRowCoordinates(previousRow, row); } else { // Note: previousRow.getColumns() == null if for first row isRowValidForAllRequiredColumns = TableExtractionResultModifierUtility .finishTasksForPreviousRow(tableColumnList, rowList, isRowValidForAllRequiredColumns, previousRow, lineDataCarrier); if (!isRowValidForAllRequiredColumns) { LOGGER.debug( "Removed previous row from rowList as required column data was missing from it."); } allRowFinishTasksDone = false; LOGGER.info("Adding a new row to the row list..."); rowList.add(row); previousRow = row; } } lineIndex++; } // following executes for last row in extracted table data. if (!allRowFinishTasksDone) { isRowValidForAllRequiredColumns = TableExtractionResultModifierUtility.finishTasksForPreviousRow( tableColumnList, rowList, isRowValidForAllRequiredColumns, previousRow, lineDataCarrier); if (!isRowValidForAllRequiredColumns) { LOGGER.debug("Removed previous row from rowList as required column data was missing from it."); } } } return rowList; } /** * Apply table row validation of each row of list. 
* * @param ruleOperatorDefinedForTable {@link String} * @param listofTableRules {@link List}<{@link TableRuleInfo}> * @param isRemoveInvalidRows boolean * @param columnCurrencyMap {@link Map<{@link String}, {@link CurrencyCode}> * @param rowList {@link List}<{@link Row}> */ private void applyTableValidationRulesOnRows(final String ruleOperatorDefinedForTable, final List<TableRuleInfo> listofTableRules, final boolean isRemoveInvalidRows, final Map<String, CurrencyCode> columnCurrencyMap, List<Row> rowList) { if (rowList != null && !rowList.isEmpty()) { // Added to delete rows with invalid row as per rule when Remove invalid rows is true. final List<Row> listToBeDeleted = new ArrayList<Row>(rowList.size()); StringBuilder invalidCharactesString = null; if (!EphesoftStringUtil.isNullOrEmpty(invalidRuleCharacters)) { final String[] inValidCharcters = invalidRuleCharacters.split(TableExtractionConstants.SEMI_COLON); if (inValidCharcters != null && inValidCharcters.length > 0) { invalidCharactesString = new StringBuilder(); for (int index = 0; index < inValidCharcters.length; index++) { invalidCharactesString.append(inValidCharcters[index]); } } } for (Row tableRow : rowList) { // if multiple rule validation operator is "and". 
if (TableExtractionConstants.AND_OPERATOR.equalsIgnoreCase(ruleOperatorDefinedForTable)) { for (int index = 0; index < listofTableRules.size(); index++) { if (!isRuleValid(listofTableRules.get(index).getRule(), tableRow, invalidCharactesString, columnCurrencyMap)) { setRowForInvalidRule(tableRow, listToBeDeleted); break; } else { setRowForInvalidRule(tableRow, listToBeDeleted); } } } else { for (int index = 0; index < listofTableRules.size(); index++) { if (!isRuleValid(listofTableRules.get(index).getRule(), tableRow, invalidCharactesString, columnCurrencyMap)) { setRowForInvalidRule(tableRow, listToBeDeleted); } else { setRowForValidRule(tableRow, listToBeDeleted); tableRow.setRuleViolated(null); break; } } } } // Removes invalid rows based on rule defined if this property has been set as checked. if (isRemoveInvalidRows) { rowList.removeAll(listToBeDeleted); } } } /** * This method creates new document level fields for document if they haven't been created by previous plugins. * * @param eachDocType * @param batchInstanceIdentifier */ private void createDocLevelFields(final Document eachDocType, final String batchInstanceIdentifier) { DocumentLevelFields documentLevelFields = eachDocType.getDocumentLevelFields(); if (documentLevelFields == null) { documentLevelFields = new DocumentLevelFields(); eachDocType.setDocumentLevelFields(documentLevelFields); } final List<DocField> docLevelFields = documentLevelFields.getDocumentLevelField(); if (docLevelFields == null || docLevelFields.isEmpty()) { LOGGER.info("Getting document level fields for document type : " + eachDocType.getType()); final List<com.ephesoft.dcma.da.domain.FieldType> allFdTypes = pluginPropertiesService .getFieldTypes(batchInstanceIdentifier, eachDocType.getType()); if (allFdTypes != null) { for (final com.ephesoft.dcma.da.domain.FieldType fdType : allFdTypes) { // Create new document level field LOGGER.info("Creating new document level field"); final DocField docLevelField = new DocField(); 
docLevelField.setName(fdType.getName()); docLevelField.setFieldOrderNumber(fdType.getFieldOrderNumber()); docLevelField.setType(fdType.getDataType().name()); docLevelField.setFieldValueChangeScript(fdType.isFieldValueChangeScriptEnabled()); if (null != fdType.getCategoryName()) { docLevelField.setCategory(fdType.getCategoryName()); } else { // docLevelField.setCategory("Uncategorised"); docLevelField.setCategory(CategoryType.GROUP_1.getCategoryName()); } docLevelField.setHidden(fdType.isHidden()); // Object newValue = getValueForDocField(fdType.getName(), allColumnNames, extractedData); // docLevelField.setValue(newValue.toString()); // Add new document level field to document. docLevelFields.add(docLevelField); LOGGER.info("New doc level field added : " + docLevelField.getName()); } } else { LOGGER.info("No field types could be found for document type :" + eachDocType.getType()); } } } /** * Returns true if rule defined for table is valid otherwise returns false. * * @param ruleDefinedForTable {@link String} The rule defined for a table. * @param row {@link Row} The row of a table. * @param columnCurrencyMap {@link Map}<{@link String}, {@link CurrencyCode}> is used to map the currency codes corresponding to * the different column names. If null currency extraction is not performed. * * @return true if rule defined for table is valid otherwise returns false. 
* @throws Exception {@link Exception} */ private boolean isRuleValid(final String ruleDefinedForTable, final Row row, final StringBuilder invalidCharactesString, final Map<String, CurrencyCode> columnCurrencyMap) { boolean isValid = false; try { if (!EphesoftStringUtil.isNullOrEmpty(ruleDefinedForTable)) { Map<String, Integer> relationalOperators = createRelationalOperatorsMap(); String selectedOperator = getSelectedRelationalOperator(ruleDefinedForTable, relationalOperators); final int indexOfRuleDefined = ruleDefinedForTable.indexOf(selectedOperator); if (indexOfRuleDefined != -1) { final String ruleDefinedRHS = ruleDefinedForTable.substring( indexOfRuleDefined + selectedOperator.length(), ruleDefinedForTable.length()); final String ruleDefinedLHS = ruleDefinedForTable.substring(0, indexOfRuleDefined - 1); if (!EphesoftStringUtil.isNullOrEmpty(ruleDefinedLHS) && !EphesoftStringUtil.isNullOrEmpty(ruleDefinedRHS)) { final String[] columnsRHS = EphesoftStringUtil.splitString(ruleDefinedRHS.trim(), TableExtractionConstants.AMPERSAND_STRING); final String[] columnsLHS = EphesoftStringUtil.splitString(ruleDefinedLHS.trim(), TableExtractionConstants.AMPERSAND_STRING); if (columnsLHS != null && columnsLHS.length > 0 && columnsRHS != null && columnsRHS.length > 0) { final StringBuilder ruleModifiedForExpressionEvaluatorLHS = new StringBuilder(); final StringBuilder ruleModifiedForExpressionEvaluatorRHS = new StringBuilder(); final Map<String, Object> columnValuesMapLHS = getColumnValuesMap(row, getColumnNamesMap(ruleModifiedForExpressionEvaluatorLHS, columnsLHS), invalidCharactesString, columnCurrencyMap); if (columnValuesMapLHS != null && columnValuesMapLHS.get(TableExtractionConstants.TABLE_RULE_KEY) == null) { final Map<String, Object> columnValuesMapRHS = getColumnValuesMap(row, getColumnNamesMap(ruleModifiedForExpressionEvaluatorRHS, columnsRHS), invalidCharactesString, columnCurrencyMap); if (columnValuesMapRHS != null && columnValuesMapRHS 
.get(TableExtractionConstants.TABLE_RULE_KEY) == null) { final ExpressionEvaluator<Double> expressionEvaluatorLHS = getColumnsExpressionEvaluator( ruleModifiedForExpressionEvaluatorLHS.toString(), columnValuesMapLHS); final ExpressionEvaluator<Double> expressionEvaluatorRHS = getColumnsExpressionEvaluator( ruleModifiedForExpressionEvaluatorRHS.toString(), columnValuesMapRHS); if (expressionEvaluatorLHS != null && expressionEvaluatorRHS != null) { isValid = checkExpressionsForSelectedInequality(relationalOperators, selectedOperator, expressionEvaluatorLHS, expressionEvaluatorRHS); } if (!isValid) { row.setRuleViolated(ruleDefinedForTable); } } } } } } } } catch (final Exception exception) { LOGGER.error("Exception occurred while validating rule for a table row" + exception.getMessage()); row.setRuleViolated(TableExtractionConstants.INVALID_DATA_ERROR_MSG); } return isValid; } /** * Checks the passed expressions for relational inequalities like '==','<=','>=','!='. * * @param relationalOperators {@link Map} map of relational operators. * @param selectedOperator {@link String} selected operator. * @param expressionEvaluatorLHS {@link String} expression on the right hand side of equality. * @param expressionEvaluatorLHS {@link String} expression on the left hand side of equality. * @return {@link boolean} true if the inequality is satisfied, false otherwise. */ private boolean checkExpressionsForSelectedInequality(Map<String, Integer> relationalOperators, String selectedOperator, final ExpressionEvaluator<Double> expressionEvaluatorLHS, final ExpressionEvaluator<Double> expressionEvaluatorRHS) throws ScriptException, ClassCastException { boolean isValid = false; final Double expressionResultLHS = expressionEvaluatorLHS.eval(); final Double expressionResultRHS = expressionEvaluatorRHS.eval(); if (selectedOperator != null && expressionResultLHS != null && expressionResultRHS != null) { // Formatting required because of results provided by the engine were not formatted. 
final double expressionLHSValue = NumberUtil.getRoundedValue(expressionResultLHS); final double expressionRHSValue = NumberUtil.getRoundedValue(expressionResultRHS); switch (relationalOperators.get(selectedOperator)) { case 0: isValid = (expressionLHSValue == expressionRHSValue); break; case 1: isValid = (expressionLHSValue != expressionRHSValue); break; case 2: isValid = (expressionLHSValue >= expressionRHSValue); break; case 3: isValid = (expressionLHSValue <= expressionRHSValue); break; default: } } return isValid; } /** * Returns the relational operator used in the rule. * * @param ruleDefinedForTable {@link String} the table rule. * @param relationalOperators {@link Map} map of all the relational operators. * @return {@link String} the relational operator used in the rule. */ private String getSelectedRelationalOperator(final String ruleDefinedForTable, Map<String, Integer> relationalOperators) { String selectedOperator = null; Set<String> operators = relationalOperators.keySet(); for (String operator : operators) { if (ruleDefinedForTable.indexOf(operator) != -1) { selectedOperator = operator; } } return selectedOperator; } /** * Creates a map of relational operators that may occur in an expression. * * @return {@link Map} map of relational operators. */ private Map<String, Integer> createRelationalOperatorsMap() { Map<String, Integer> relationalOperators = new HashMap<String, Integer>(); relationalOperators.put(TableExtractionConstants.CHAR_EQUAL, 0); relationalOperators.put(TableExtractionConstants.NOT_EQUAL, 1); relationalOperators.put(TableExtractionConstants.GREATER_OR_EQUAL, 2); relationalOperators.put(TableExtractionConstants.LESS_OR_EQUAL, 3); return relationalOperators; } /** * Returns the instance of {@link ExpressionEvaluator} for the rule defined for a particular table and table columns. * * @param ruleDefined {@link String} Rule defined for the table. * @param columnsValuesMap {@link Map} contains the map for the table column value. 
* @return The instance of {@link ExpressionEvaluator}. */ private ExpressionEvaluator<Double> getColumnsExpressionEvaluator(final String ruleDefined, final Map<String, Object> columnsValuesMap) { ExpressionEvaluator<Double> expressionEvaluator = null; if (columnsValuesMap != null && !columnsValuesMap.isEmpty()) { expressionEvaluator = new ExpressionEvaluator<Double>(ruleDefined); final Set<String> columnValuesKeySet = columnsValuesMap.keySet(); final Iterator<String> columnValuesKeyItr = columnValuesKeySet.iterator(); while (columnValuesKeyItr.hasNext()) { final String key = columnValuesKeyItr.next(); expressionEvaluator.putValue(key, columnsValuesMap.get(key)); } } return expressionEvaluator; } /** * Modifies the column name of the table and creates the modified rule used by expression evaluator and returns the table columns * name map. * * @param ruleModifiedForExpressionEvaluator {@link String} The modified rule used by expression evaluator. * @param columns {@link String} The array of table columns. * @return {@link String} The table columns name map. */ private Map<String, String> getColumnNamesMap(final StringBuilder ruleModifiedForExpressionEvaluator, final String[] columns) { Map<String, String> columnNamesMap = null; if (columns != null) { columnNamesMap = new LinkedHashMap<String, String>(); for (int index = 0; index < columns.length; index++) { String tempString = columns[index]; if (tempString != null) { if (index % 2 != 0) { final String columnNamesKey = EphesoftStringUtil.concatenate("ID", index); columnNamesMap.put(columnNamesKey, tempString.trim()); tempString = columnNamesKey; } ruleModifiedForExpressionEvaluator.append(tempString); } } } return columnNamesMap; } /** * Returns the column values map. * * @param row {@link Row} The row of table. * @param columnNamesMap {@link Map} The columns name map, * @param columnCurrencyMap {@link Map} is used to map the currency codes corresponding to the different column names. 
* @return {@link Map} The column values map. */ private Map<String, Object> getColumnValuesMap(final Row row, final Map<String, String> columnNamesMap, final StringBuilder invalidCharactesString, final Map<String, CurrencyCode> columnCurrencyMap) { Map<String, Object> columnValuesMap = null; if (row != null && columnNamesMap != null) { columnValuesMap = new LinkedHashMap<String, Object>(); Set<String> columnsNameset = columnNamesMap.keySet(); Iterator<String> columnsNameItr = columnsNameset.iterator(); CurrencyCode columnCurrency = null; while (columnsNameItr.hasNext()) { String columnNameKey = columnsNameItr.next(); String columnName = columnNamesMap.get(columnNameKey); if (columnCurrencyMap != null) { columnCurrency = columnCurrencyMap.get(columnName); } String columnValue = getColumnValue(row, columnName, columnCurrency); if (invalidCharactesString != null && !EphesoftStringUtil.isNullOrEmpty(invalidCharactesString.toString())) { columnValue = removeCharacters(columnValue, invalidCharactesString.toString().toCharArray()); } if (!EphesoftStringUtil.isNullOrEmpty(columnValue) && columnValue.matches(TableExtractionConstants.TABLE_RULE_REGEX)) { columnValuesMap.put(columnNameKey, Double.valueOf(columnValue)); } else { columnValuesMap.put(TableExtractionConstants.TABLE_RULE_KEY, false); break; } } } return columnValuesMap; } private String removeCharacters(final String strToBeModified, final char[] charsToBeReplaced) { String resultString = null; if (!EphesoftStringUtil.isNullOrEmpty(strToBeModified) && charsToBeReplaced != null) { StringBuilder resultStringBuilder = new StringBuilder(strToBeModified.length()); char[] words = strToBeModified.toCharArray(); for (int index = 0; index < words.length; index++) { boolean matchFailed = true; for (int charIndex = 0; charIndex < charsToBeReplaced.length; charIndex++) { if (words[index] == charsToBeReplaced[charIndex]) { resultStringBuilder.append(TableExtractionConstants.EMPTY); matchFailed = false; break; } } if 
(matchFailed) { resultStringBuilder.append(words[index]); } } resultString = resultStringBuilder.toString(); } return resultString; } /** * Returns the value of the columnName passed for the table row. If <code>cuurencyCode</code> is not <code>NULL</code>, then * currencyExtraction rules are applied to the extracted value corresponding to the code. * * @param row {@link Row} row from which the values are to be extracted. * @param columnPassed {@link String} name of the column whose value is to be extracted. * @param currencyCode {@link CurrencyCode} implies the rules for currency extraction. If null currency extraction is not * performed. * @return {@link String} Value extracted corresponding to the column. */ private String getColumnValue(final Row row, final String columnPassed, final CurrencyCode currencyCode) { String extractedValue = getColumnValue(row, columnPassed); if (currencyCode != null) { Double currencyValue = CurrencyUtil.getDoubleValue(extractedValue, currencyCode.getRepresentationValue()); if (currencyValue != null) { extractedValue = currencyValue.toString(); } } LOGGER.info(EphesoftStringUtil.concatenate("Extracted value for ", columnPassed, " is ", extractedValue)); return extractedValue; } /** * Returns the value of the columnName passed for the table row. * * @param row {@link Row} The row of the table. * @param columnPassed {@link String} The column name passed. * @return {@link String} The value of column name passed. 
*/ private String getColumnValue(final Row row, final String columnPassed) { String columnValue = null; if (row != null || columnPassed != null) { Row.Columns columns = row.getColumns(); if (columns != null) { List<Column> columnList = columns.getColumn(); if (columnList != null && !columnList.isEmpty()) { for (Column column : columnList) { if (column != null && columnPassed.trim().equalsIgnoreCase(column.getName())) { columnValue = column.getValue(); break; } } } } } return columnValue; } /** * Sets all the row of a table to invalid. * * @param row{{@link Row} The row of a table. */ private void setAllRowInvalid(final Row row) { if (row != null) { row.setIsRuleValid(false); } } /** * Sets all the row of a table to valid. * * @param row{{@link Row} The row of a table. */ private void setAllRowValid(final Row row) { if (row != null) { row.setIsRuleValid(true); } } /** * API to set row property isRuleValid true when rule validation passes and remove it from the list to be deleted if * isRemoveInvalidRows property is unchecked. 
* * @param tableRow * @param listToBeDeleted */ private void setRowForValidRule(Row tableRow, List<Row> listToBeDeleted) { setAllRowValid(tableRow); listToBeDeleted.remove(tableRow); } /** * API to set row property isRuleValid false when rule validation fails and add it to the list of rows to be deleted if * isRemoveInvalidRows property is checked * * @param tableRow * @param listToBeDeleted */ private void setRowForInvalidRule(Row tableRow, List<Row> listToBeDeleted) { setAllRowInvalid(tableRow); listToBeDeleted.add(tableRow); } /** * @return the invalidRuleCharacters */ public String getInvalidRuleCharacters() { return invalidRuleCharacters; } /** * @param invalidRuleCharacters the invalidRuleCharacters to set */ public void setInvalidRuleCharacters(final String invalidRuleCharacters) { this.invalidRuleCharacters = invalidRuleCharacters; } /** * returns documents object after adding data tables * * @param gapBetweenColumnWords * @param documentHOCRMap * @param docTypeList * @param docTypesName * @return * @throws DCMAApplicationException */ public Documents processDocPageForTableExtractionWebService(final int gapBetweenColumnWords, final Map<DocumentType, List<HocrPages>> documentHOCRMap, List<DocumentType> docTypeList, List<String> docTypesName) throws DCMAApplicationException { Documents documents = new Batch.Documents(); int identifierIndex = 0; int pageOrder = 0; for (Map.Entry<DocumentType, List<HocrPages>> entry : documentHOCRMap.entrySet()) { ++identifierIndex; DocumentType documentType = entry.getKey(); // int index = docTypesName.indexOf(entry.getKey().getName()); final List<TableInfo> tableInfoList = tableInfoService.getTableInfoByDocumentType(documentType); if (null == tableInfoList || tableInfoList.isEmpty()) { LOGGER.info("Table info list is null or empty."); continue; } Document document = new Document(); document.setType(documentType.getName()); document.setIdentifier(EphesoftStringUtil.concatenate(documentIdetifier, identifierIndex)); 
documents.getDocument().add(document); final List<HocrPages> pageList = entry.getValue(); if (null == pageList || pageList.isEmpty()) { continue; } for (HocrPages hocrPages : pageList) { for (HocrPage hocrPage : hocrPages.getHocrPage()) { hocrPage.setPageID( EphesoftStringUtil.concatenate(TableExtractionConstants.pageIdentifier, pageOrder)); ++pageOrder; } } DataTables dataTables = document.getDataTables(); if (null == dataTables) { dataTables = new DataTables(); document.setDataTables(dataTables); } final List<DataTable> dataTableList = dataTables.getDataTable(); readAllDataTablesForTableExtractionWebService(tableInfoList, dataTableList, pageList, gapBetweenColumnWords); } return documents; } public Documents processDocPageForTestExtraction(final int gapBetweenColumnWords, final Map<Document, List<HocrPages>> documentHOCRMap, final String batchClassIdentifier) throws DCMAApplicationException { Documents documents = new Batch.Documents(); int identifierIndex = 0; for (Map.Entry<Document, List<HocrPages>> entry : documentHOCRMap.entrySet()) { ++identifierIndex; Document document = entry.getKey(); final List<TableInfo> tableInfoList = tableInfoService.getTableInfoByDocTypeName(document.getType(), batchClassIdentifier); if (null == tableInfoList || tableInfoList.isEmpty()) { LOGGER.info("Table info list is null or empty."); continue; } documents.getDocument().add(document); final List<HocrPages> pageList = entry.getValue(); if (null == pageList || pageList.isEmpty()) { continue; } DataTables dataTables = document.getDataTables(); if (null == dataTables) { dataTables = new DataTables(); document.setDataTables(dataTables); } final List<DataTable> dataTableList = dataTables.getDataTable(); readAllDataTablesForTableExtractionWebService(tableInfoList, dataTableList, pageList, gapBetweenColumnWords); } return documents; } /** * Returns documents object after adding data tables to the exisiting document * * @param gapBetweenColumnWords * @param documentHOCRMap * @param 
docTypeList * @param docTypesName * @return * @throws DCMAApplicationException */ public void processDocPageForTableExtraction(final int gapBetweenColumnWords, final Map<Document, List<HocrPages>> documentHOCRMap, List<DocumentType> docTypeList, List<String> docTypesName, final boolean isTestExtractionFlow) throws DCMAApplicationException { int identifierIndex = 0; int pageOrder = 0; for (Map.Entry<Document, List<HocrPages>> entry : documentHOCRMap.entrySet()) { ++identifierIndex; Document document = entry.getKey(); int index = docTypesName.indexOf(entry.getKey().getType()); final List<TableInfo> tableInfoList = tableInfoService .getTableInfoByDocumentType(docTypeList.get(index)); if (null == tableInfoList || tableInfoList.isEmpty()) { LOGGER.info("Table info list is null or empty."); continue; } final List<HocrPages> pageList = entry.getValue(); if (null == pageList || pageList.isEmpty()) { continue; } for (HocrPages hocrPages : pageList) { for (HocrPage hocrPage : hocrPages.getHocrPage()) { hocrPage.setPageID( EphesoftStringUtil.concatenate(TableExtractionConstants.pageIdentifier, pageOrder)); ++pageOrder; } } DataTables dataTables = document.getDataTables(); if (null == dataTables) { dataTables = new DataTables(); document.setDataTables(dataTables); } final List<DataTable> dataTableList = dataTables.getDataTable(); readAllDataTablesForTableExtractionWebService(tableInfoList, dataTableList, pageList, gapBetweenColumnWords); } } /** * returns true after extracting table values from the HOCR after applying extraction rules * * @param tableInfoList * @param dataTableList * @param pageList * @param gapBetweenColumnWords * @return * @throws DCMAApplicationException */ private boolean readAllDataTablesForTableExtractionWebService(final List<TableInfo> tableInfoList, final List<DataTable> dataTableList, final List<HocrPages> pageList, final int gapBetweenColumnWords) throws DCMAApplicationException { DataTableService tableExtractionUtility = new DataTableService(); for 
(final TableInfo tableInfo : tableInfoList) { if (null == tableInfo) { LOGGER.info("Table info is null."); continue; } final String tableName = tableInfo.getName(); LOGGER.info(EphesoftStringUtil.concatenate("Extracting data for table : ", tableName)); if (EphesoftStringUtil.isNullOrEmpty(tableName)) { throw new DCMAApplicationException("Table name is null or empty."); } final DataTable dataTable = new DataTable(); dataTable.setName(tableName); dataTableList.add(dataTable); tableExtractionUtility.initializeHeaderRow(dataTable); tableExtractionUtility.initializeDataTableRows(dataTable); final List<TableColumnsInfo> tableColumnsInfoList = tableInfo.getTableColumnsInfo(); if (CollectionUtils.isEmpty(tableColumnsInfoList)) { LOGGER.error("Table Columns Info list is null or empty."); continue; } HeaderRow headerRow = dataTable.getHeaderRow(); HeaderRow.Columns columnsHeader = headerRow.getColumns(); final List<Column> columnHeaderList = columnsHeader.getColumn(); tableExtractionUtility.createColumnHeaders(columnHeaderList, tableColumnsInfoList); final List<TableExtractionRule> tableExtractionRuleList = tableInfo.getTableExtractionRules(); final CurrencyCode tableCurrencyCode = tableInfo.getCurrencyCode(); final String ruleOperatorDefinedForTable = tableInfo.getRuleOperator(); final boolean isRemoveInvalidRows = tableInfo.isRemoveInvalidRows(); final List<TableRuleInfo> tableValidationRulesList = tableInfo.getTableRuleInfo(); LOGGER.debug(EphesoftStringUtil.concatenate("Currency for the table is: ", tableCurrencyCode, "\n Rule Operator for table is: ", ruleOperatorDefinedForTable, "\n Do we need to remove invalid rows?: ", isRemoveInvalidRows)); List<LineDataCarrier> lineDataCarrierList = null; Map<String, DataCarrier> colHeaderInfoMap = null; Map<String, CurrencyCode> columnCurrencyMap = null; List<TableColumnVO> tableColumnVOList = null; int noOfTableColumns = tableColumnsInfoList.size(); List<TableExtractionResult> allTableRowsList = new 
ArrayList<TableExtractionResult>( tableExtractionRuleList.size()); if (!CollectionUtils.isEmpty(tableExtractionRuleList)) { for (TableExtractionRule tableExtractionRule : tableExtractionRuleList) { if (null != tableExtractionRule) { final String startPattern = tableExtractionRule.getStartPattern(); if (EphesoftStringUtil.isNullOrEmpty(startPattern)) { LOGGER.error("Rule: ", tableExtractionRule.getRuleName(), "'s start pattern is null or empty."); continue; } final String endPattern = tableExtractionRule.getEndPattern(); TableExtractionAPI tableExtractionAPI = tableExtractionUtility .getTableExtractionAPI(tableExtractionRule.getTableAPI()); final List<TableColumnExtractionRule> tableColumnExtractionRuleList = tableExtractionRule .getTableColumnExtractionRules(); tableColumnVOList = tableExtractionUtility.getTableColumnData(tableColumnsInfoList, tableColumnVOList, noOfTableColumns, tableColumnExtractionRuleList); lineDataCarrierList = TableRowFinderUtility .searchAllRowOfTablesForTableExtractionWebServvice(pageList, startPattern, endPattern, tableFinderService.getFuzzyMatchThresholdValue()); if (CollectionUtils.isNotEmpty(lineDataCarrierList)) { boolean colHeaderValidationRequired = tableExtractionAPI .isColHeaderValidationRequired(); boolean colCoordValidationRequired = tableExtractionAPI.isColCoordValidationRequired(); if (colHeaderValidationRequired) { colHeaderInfoMap = tableExtractionUtility.getColumnHeaderMap(colHeaderInfoMap, tableColumnVOList); tableExtractionUtility.setColumnHeaderInfo(lineDataCarrierList, colHeaderInfoMap, tableColumnVOList, tableFinderService.getFuzzyMatchThresholdValue()); } boolean isTableColumnListDirty = tableExtractionUtility .sortTableColumnsInOrderOfOccurance(colHeaderInfoMap, tableColumnVOList, colHeaderValidationRequired, colCoordValidationRequired); final List<Row> rowList = addDataTablesValues(lineDataCarrierList, tableColumnVOList, colHeaderInfoMap, tableExtractionAPI, gapBetweenColumnWords); if 
(CollectionUtils.isNotEmpty(rowList)) { columnCurrencyMap = tableExtractionUtility.getColumnCurrencyMap(tableCurrencyCode, columnCurrencyMap, noOfTableColumns, tableColumnExtractionRuleList); applyTableValidationRulesOnRows(ruleOperatorDefinedForTable, tableValidationRulesList, isRemoveInvalidRows, columnCurrencyMap, rowList); if (null != columnCurrencyMap) { columnCurrencyMap.clear(); } allTableRowsList .add(new TableExtractionResult(rowList, tableExtractionRule.getRuleName())); } if (isTableColumnListDirty) { for (TableExtractionResult list : allTableRowsList) { if (null != list) { tableExtractionUtility.sortTableRowsInUiOrder(tableColumnsInfoList, list.getRowList()); } } } if (null != colHeaderInfoMap) { colHeaderInfoMap.clear(); } lineDataCarrierList.clear(); } if (null != tableColumnVOList) { tableColumnVOList.clear(); } } } } tableExtractionUtility.addBestRuleRows(dataTable, allTableRowsList); List<Row> dataTableRowList = dataTable.getRows().getRow(); if (CollectionUtil.isEmpty(dataTableRowList)) { insertEmptyRow(dataTableRowList, tableColumnsInfoList); applyTableValidationRulesOnRows(ruleOperatorDefinedForTable, tableValidationRulesList, isRemoveInvalidRows, columnCurrencyMap, dataTableRowList); } } return true; } public void extractFieldsForTestExtraction(String folderLocation, String batchClassIdentifier, Batch batch) throws DCMAApplicationException { final String switchValue = batchClassPluginPropertiesService.getPropertyValue(batchClassIdentifier, TABLE_EXTRACTION_PLUGIN, TableExtractionProperties.TABLE_EXTRACTION_SWITCH); if (null != switchValue) { setInvalidRuleCharacters(tableFinderService.getInvalidRuleCharacters()); final int gapBetweenColumnWords = tableFinderService.getGapBetweenColumnWords(); LOGGER.info("batchClassIdentifier : " + batchClassIdentifier); try { final List<Document> docTypeList = batch.getDocuments().getDocument(); if (null == docTypeList) { LOGGER.info("In valid batch documents."); } else { performTableExtraction(folderLocation, 
batchClassIdentifier, batch, gapBetweenColumnWords, docTypeList); } } catch (final DCMAApplicationException e) { LOGGER.error(e.getMessage()); throw new DCMAApplicationException(e.getMessage(), e); } catch (final Exception e) { LOGGER.error(e.getMessage()); throw new DCMAApplicationException(e.getMessage(), e); } batchSchemaService.updateBatch(batch); } else { LOGGER.info("Skipping Table extraction. Switch set as off."); } } /** * Performs table extraction * * @param folderLocation * @param batchClassIdentifier * @param batch * @param gapBetweenColumnWords * @param docTypeList * @throws DCMAApplicationException */ private void performTableExtraction(String folderLocation, String batchClassIdentifier, Batch batch, int gapBetweenColumnWords, List<Document> docTypeList) throws DCMAApplicationException { if (null != batch && null != docTypeList) { //BatchClass batchClass = batchClassService.getBatchClassByIdentifier(batchClassIdentifier); Map<Document, List<HocrPages>> documentToHOCRMapping = new LinkedHashMap<Document, List<HocrPages>>(); List<DocumentType> documentTypeList = documentTypeService .getDocTypeByBatchClassIdentifier(batchClassIdentifier); List<String> documentTypeName = new ArrayList<String>(); if (null != documentTypeList) { for (DocumentType docType : documentTypeList) { documentTypeName.add(docType.getName()); } } if (null != documentTypeList) { for (Document document : docTypeList) { if (null != document) { List<Page> pages = document.getPages().getPage(); if (null != pages) { List<HocrPages> hocrPageList = new ArrayList<HocrPages>(); for (Page page : pages) { HocrPages hocrPage = batchSchemaService.getHocrPagesForTestContent(folderLocation, page.getHocrFileName()); if (null != hocrPage) { hocrPageList.add(hocrPage); } } documentToHOCRMapping.put(document, hocrPageList); } } } if (null != documentToHOCRMapping) { Batch.Documents documents = processDocPageForTestExtraction(gapBetweenColumnWords, documentToHOCRMapping, batchClassIdentifier); if (null != 
documents) { mergeDocuments(documents, batch); } } } } } private void mergeDocuments(Documents documents, Batch batch) { if (null != documents.getDocument()) { for (Document document : documents.getDocument()) { if (null != document) { for (Document batchDocument : batch.getDocuments().getDocument()) { if (batchDocument.getIdentifier().equals(document.getIdentifier())) { if (null != document.getDataTables()) { batchDocument.setDataTables(document.getDataTables()); break; } } } } } } } }