Java tutorial
/** * The contents of this file are subject to the Mozilla Public * License Version 1.1 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or * implied. See the License for the specific language governing * rights and limitations under the License. * * The Original Code is "EINRC-7 / GDEM project". * * The Initial Developer of the Original Code is TietoEnator. * The Original Code code was developed for the European * Environment Agency (EEA) under the IDA/EINRC framework contract. * * Copyright (C) 2000-2004 by European Environment Agency. All * Rights Reserved. * * Original Code: Enriko Ksper (TietoEnator) */ package eionet.gdem.conversion.excel.reader; import eionet.gdem.GDEMException; import eionet.gdem.conversion.datadict.DDElement; import eionet.gdem.conversion.datadict.DD_XMLInstance; import eionet.gdem.conversion.spreadsheet.DDXMLConverter; import eionet.gdem.conversion.spreadsheet.SourceReaderIF; import eionet.gdem.conversion.spreadsheet.SourceReaderLogger; import eionet.gdem.conversion.spreadsheet.SourceReaderLogger.ReaderTypeEnum; import eionet.gdem.dto.ConversionResultDto; import eionet.gdem.utils.Utils; import org.apache.commons.lang3.StringUtils; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.FormulaEvaluator; import org.apache.poi.ss.usermodel.RichTextString; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import java.io.File; import java.io.FileInputStream; import java.util.ArrayList; import java.util.Date; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; /** * The main class, which is calling POI HSSF methods for reading Excel file. * * @author Enriko Ksper */ public class ExcelReader implements SourceReaderIF { /** * Excel eorkbook to be converted. */ private Workbook wb = null; /** * Excel sheet name where Data Dictionary writes XML Schema information. */ private static final String SCHEMA_SHEET_NAME = "DO_NOT_DELETE_THIS_SHEET"; /** * Default date format pattern. */ private static final String DEFAULT_DATE_FORMAT = "yyyy-MM-dd"; /** * Date formatter. */ private static DataFormatter formatter = new DataFormatter(new Locale("en", "US")); /** * Boolean indicates if Excel file is in 2007 version format or not. */ private boolean isExcel2007 = false; /** * Logger object for writing conversion log. */ private SourceReaderLogger readerLogger; /** * List of Excel sheet names. */ private List<String> excelSheetNames = new ArrayList<String>(); /** * Excel file size. */ private long inputFileLength = 0; /** * Formula evaluator used for calculating formulas in cell. */ private FormulaEvaluator evaluator; /** * Non-breaking space unicode. */ final char NON_BREAKING_SPACE = 0x00A0; /** * Class constructor. * * @param excel2007 true if the file is Excel 2007 or newer. */ public ExcelReader(boolean excel2007) { isExcel2007 = excel2007; } @Override public void initReader(File inputFile) throws GDEMException { if (inputFile == null) { throw new GDEMException("Input file is missing"); } try { if (!isExcel2007) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(inputFile)); wb = new HSSFWorkbook(fs); } else { OPCPackage p = OPCPackage.open(new FileInputStream(inputFile)); wb = WorkbookFactory.create(p); } } catch (Exception e) { throw new GDEMException("ErrorConversionHandler - couldn't open Excel file: " + e.toString()); } inputFileLength = inputFile.length(); evaluator = wb.getCreationHelper().createFormulaEvaluator(); } @Override public void startReader(ConversionResultDto resultObject) { readerLogger = new SourceReaderLogger(resultObject, ReaderTypeEnum.EXCEL); readerLogger.logStartWorkbook(); excelSheetNames = getSheetNames(); readerLogger.logNumberOfSheets(wb.getNumberOfSheets(), StringUtils.join(excelSheetNames, ", ")); } @Override public void closeReader() { readerLogger.logEndWorkbook(inputFileLength); } @Override public String getXMLSchema() { if (wb == null) { return null; } Sheet schemaSheet = wb.getSheet(SCHEMA_SHEET_NAME); if (schemaSheet == null) { for (int i = 0; i < wb.getNumberOfSheets(); i++) { schemaSheet = wb.getSheetAt(i); String schema = findSchemaFromSheet(schemaSheet); if (schema != null) { return schema; } } } else { return findSchemaFromSheet(schemaSheet); } return null; } @Override public String getFirstSheetName() { if (wb == null) { return null; } for (int i = 0; i < wb.getNumberOfSheets(); i++) { String sheetName = wb.getSheetName(i).trim(); if (sheetName.equalsIgnoreCase(SCHEMA_SHEET_NAME)) { continue; } return sheetName; } return null; } @Override public Map<String, String> getSheetSchemas() { if (wb == null) { return null; } Sheet schemaSheet = wb.getSheet(SCHEMA_SHEET_NAME); if (schemaSheet == null) { for (int i = 0; i < wb.getNumberOfSheets(); i++) { schemaSheet = wb.getSheetAt(i); Map<String, String> schemas = findSheetSchemas(schemaSheet); if (schemas != null) { if (!schemas.isEmpty()) { return schemas; } } } } else { return findSheetSchemas(schemaSheet); } return null; } @Override public void writeContentToInstance(DD_XMLInstance instance) throws Exception { List<DDXmlElement> tables = instance.getTables(); if (tables == null || wb == null) { readerLogger.logNoDefinitionsForTables(); return; } for (int i = 0; i < tables.size(); i++) { DDXmlElement table = tables.get(i); String tblLocalName = table.getLocalName(); if (tblLocalName != null && tblLocalName.length() > 31) { tblLocalName = tblLocalName.substring(0, 31); } String tblName = table.getName(); String tblAttrs = table.getAttributes(); readerLogger.logStartSheet(tblLocalName); readerLogger.logSheetSchema(instance.getInstanceUrl(), tblLocalName); if (!excelSheetNames.contains(tblLocalName)) { readerLogger.logSheetNotFound(tblLocalName); } Sheet sheet = getSheet(tblLocalName); Sheet metaSheet = getMetaSheet(tblLocalName); if (sheet == null) { readerLogger.logEmptySheet(tblLocalName); continue; } int firstRow = sheet.getFirstRowNum(); int lastRow = sheet.getLastRowNum(); Row row = sheet.getRow(firstRow); Row metaRow = null; List<DDXmlElement> elements = instance.getTblElements(tblName); setColumnMappings(row, elements, true); if (metaSheet != null) { metaRow = metaSheet.getRow(firstRow); setColumnMappings(metaRow, elements, false); } try { logColumnMappings(tblLocalName, row, metaRow, elements); } catch (Exception e) { e.printStackTrace(); readerLogger.logSystemWarning(tblLocalName, "cannot write log about missing or ectra columns."); } instance.writeTableStart(tblName, tblAttrs); instance.setCurRow(tblName); Map<String, DDElement> elemDefs = instance.getElemDefs(tblLocalName); // read data // there are no data rows in the Excel file. We create empty table firstRow = (firstRow == lastRow) ? lastRow : firstRow + 1; int countRows = 0; for (int j = firstRow; j <= lastRow; j++) { row = (firstRow == 0) ? null : sheet.getRow(j); metaRow = (metaSheet != null && firstRow != 0) ? metaSheet.getRow(j) : null; // don't convert empty rows. if (isEmptyRow(row)) { continue; } countRows++; instance.writeRowStart(); for (int k = 0; k < elements.size(); k++) { DDXmlElement elem = elements.get(k); String elemName = elem.getName(); String elemLocalName = elem.getLocalName(); String elemAttributes = elem.getAttributes(); int colIdx = elem.getColIndex(); boolean isMainTable = elem.isMainTable(); String schemaType = null; boolean hasMultipleValues = false; String delim = null; // get element definition info if (elemDefs != null && elemDefs.containsKey(elemLocalName)) { schemaType = elemDefs.get(elemLocalName).getSchemaDataType(); delim = elemDefs.get(elemLocalName).getDelimiter(); hasMultipleValues = elemDefs.get(elemLocalName).isHasMultipleValues(); } String data = ""; if (colIdx > -1) { data = (isMainTable) ? getCellValue(row, colIdx, schemaType) : getCellValue(metaRow, colIdx, null); } if (hasMultipleValues && !Utils.isNullStr(delim)) { String[] values = data.split(delim); for (String value : values) { instance.writeElement(elemName, elemAttributes, value.trim()); } } else { instance.writeElement(elemName, elemAttributes, data); } } instance.writeRowEnd(); } instance.writeTableEnd(tblName); readerLogger.logNumberOfRows(countRows, tblLocalName); readerLogger.logEndSheet(tblLocalName); } } /** * Check if row is empty or not. * * @param row MS Excel row. * @return boolean */ public boolean isEmptyRow(Row row) { if (row == null) { return true; } for (int j = 0; j <= row.getLastCellNum(); j++) { Cell cell = row.getCell(j); if (cell == null) { continue; } if (!Utils.isNullStr(cellValueToString(cell, null))) { return false; } } return true; } @Override public boolean isEmptySheet(String sheetName) { Sheet sheet = getSheet(sheetName); int rowCount = sheet.getLastRowNum(); if (rowCount < 1) { return true; } // check if the first row has any data for (int i = 1; i <= rowCount; i++) { Row row = sheet.getRow(i); if (isEmptyRow(row)) { continue; } else { return false; } } return true; } /** * Method goes through 4 rows and search the best fit of XML Schema. The deault row is 4. * * @param schemaSheet Schema sheet name. * @return schema URL. */ private String findSchemaFromSheet(Sheet schemaSheet) { Row schemaRow = null; Cell schemaCell = null; for (int i = 3; i > -1; i--) { if (schemaSheet.getLastRowNum() < i) { continue; } schemaRow = schemaSheet.getRow(i); if (schemaRow == null) { continue; } if (schemaRow.getLastCellNum() < 0) { continue; } schemaCell = schemaRow.getCell(0); String val = schemaCell.getRichStringCellValue().toString(); if (val.startsWith("http://") && val.toLowerCase().indexOf("/getschema") > 0 && Utils.isURL(val)) { return val; } } return null; } /** * Method goes through rows after XML Schema and finds schemas for Excel sheets (DataDict tables). cell(0) =sheet name; * cell(1)=XML schema * * @param schemaSheet sheet name * @return Map */ private Map<String, String> findSheetSchemas(Sheet schemaSheet) { Row schemaRow = null; Cell schemaCell = null; Cell sheetCell = null; Map<String, String> result = new LinkedHashMap<String, String>(); if (schemaSheet.getLastRowNum() < 1) { return null; } for (int i = 0; i <= schemaSheet.getLastRowNum(); i++) { schemaRow = schemaSheet.getRow(i); if (schemaRow == null) { continue; } if (schemaRow.getLastCellNum() < 1) { continue; } schemaCell = schemaRow.getCell(1); if (schemaCell == null) { continue; } String schemaValue = schemaCell.getRichStringCellValue().toString(); if (schemaValue.startsWith("http://") && schemaValue.toLowerCase().indexOf("/getschema") > 0 && Utils.isURL(schemaValue)) { sheetCell = schemaRow.getCell(0); String sheetValue = sheetCell.getRichStringCellValue().toString(); if (sheetValue == null) { continue; } if (sheetValue != null && sheetValue.length() > 31) { sheetValue = sheetValue.substring(0, 31); } Sheet sheet = getSheet(sheetValue); if (sheet != null && !result.containsKey(sheetValue)) { result.put(sheetValue, schemaValue); } } } return result; } /** * Get Sheet object by sheet name. * * @param name sheet name * @return Sheet */ private Sheet getSheet(String name) { Sheet sheet = wb.getSheet(name.trim()); if (sheet == null) { for (int i = 0; i < wb.getNumberOfSheets(); i++) { String sheetName = wb.getSheetName(i); if (sheetName.trim().equalsIgnoreCase(name.trim())) { return wb.getSheet(sheetName); } } } else { return sheet; } return null; } /** * Returns the list of MS Excel sheet names. * * @return List of sheet names. */ private List<String> getSheetNames() { List<String> list = new ArrayList<String>(); for (int i = 0; i < wb.getNumberOfSheets(); i++) { String sheetName = wb.getSheetName(i); list.add(sheetName); } return list; } /** * Reads cell value and formats it according to element type defined in XML Schema. If the cell contains formula, * then calculated value is returned. * * @param cell Spreadsheet Cell object. * @param schemaType XML Schema data type for given cell. * @return string value of the cell. */ protected String cellValueToString(Cell cell, String schemaType) { String value = ""; if (cell != null) { switch (evaluator.evaluateInCell(cell).getCellType()) { case HSSFCell.CELL_TYPE_NUMERIC: if (HSSFDateUtil.isCellDateFormatted(cell) && !isYearValue(cell.getNumericCellValue())) { Date dateValue = cell.getDateCellValue(); value = Utils.getFormat(dateValue, DEFAULT_DATE_FORMAT); } else if (HSSFDateUtil.isValidExcelDate(cell.getNumericCellValue()) && schemaType != null && schemaType.equals("xs:date") && !isYearValue(cell.getNumericCellValue())) { Date dateValue = cell.getDateCellValue(); value = Utils.getFormat(dateValue, DEFAULT_DATE_FORMAT); } else { value = formatter.formatCellValue(cell); } break; case HSSFCell.CELL_TYPE_STRING: RichTextString richText = cell.getRichStringCellValue(); value = richText.toString(); break; case HSSFCell.CELL_TYPE_BOOLEAN: value = Boolean.toString(cell.getBooleanCellValue()); break; case HSSFCell.CELL_TYPE_ERROR: break; case HSSFCell.CELL_TYPE_FORMULA: break; default: break; } } return StringUtils.strip(value.trim(), String.valueOf(NON_BREAKING_SPACE)).trim(); } /** * DD can generate additional "-meta" sheets with GIS elements for one DD table. In XML these should be handled as 1 table. This * is method for finding these kind of sheets and parsing these in parallel with the main sheet * * @param mainSheetName Main sheet name. * @return Spreadsheet Sheet object. */ private Sheet getMetaSheet(String mainSheetName) { return getSheet(mainSheetName + DDXMLConverter.META_SHEET_NAME); } /** * Read column header. * * @param row Excel row object * @param elements List of DD table elements * @param mainTable true if the table is main table. */ private void setColumnMappings(Row row, List<DDXmlElement> elements, boolean mainTable) { if (row == null || elements == null) { return; } int firstCell = row.getFirstCellNum(); int lastCell = row.getLastCellNum(); for (int j = 0; j < elements.size(); j++) { DDXmlElement elem = elements.get(j); String elemLocalName = elem.getLocalName(); for (int k = firstCell; k < lastCell; k++) { Cell cell = row.getCell(k); String colName = cellValueToString(cell, null); colName = colName != null ? colName.trim() : ""; if (colName.equalsIgnoreCase(elemLocalName)) { elem.setColIndex(k); elem.setMainTable(mainTable); break; } } } } /** * Goes through all columns and logs missing and redundant columns into conversion log. * * @param sheetName Excel sheet name. * @param row Excel Row object * @param metaRow Excel meta sheet row * @param elements List of XML elements */ private void logColumnMappings(String sheetName, Row row, Row metaRow, List<DDXmlElement> elements) { int nofColumns = row.getLastCellNum() - row.getFirstCellNum(); readerLogger.logNumberOfColumns(nofColumns, sheetName); if (metaRow != null) { int nofMetaColumns = row.getLastCellNum() - row.getFirstCellNum(); readerLogger.logNumberOfColumns(nofMetaColumns, sheetName + DDXMLConverter.META_SHEET_NAME); } List<String> missingColumns = new ArrayList<String>(); List<String> elemNames = new ArrayList<String>(); for (DDXmlElement element : elements) { if (element.getColIndex() < 0) { missingColumns.add(element.getLocalName()); } elemNames.add(element.getLocalName().toLowerCase()); } if (missingColumns.size() > 0) { readerLogger.logMissingColumns(StringUtils.join(missingColumns, ", "), sheetName); } List<String> extraColumns = getExtraColumns(sheetName, row, elemNames); if (extraColumns.size() > 0) { readerLogger.logExtraColumns(StringUtils.join(extraColumns, ", "), sheetName); } if (metaRow != null) { List<String> extraMetaColumns = getExtraColumns(sheetName, metaRow, elemNames); if (extraMetaColumns.size() > 0) { readerLogger.logExtraColumns(StringUtils.join(extraColumns, ", "), sheetName + DDXMLConverter.META_SHEET_NAME); } } } /** * Find redundant columns from the list of columns. * * @param sheetName Excel sheet name. * @param row Excel row. * @param elemNames DD element names. * @return List of extra columns added to sheet. */ private List<String> getExtraColumns(String sheetName, Row row, List<String> elemNames) { List<String> extraColumns = new ArrayList<String>(); List<Integer> emptyColumns = new ArrayList<Integer>(); for (int k = row.getFirstCellNum(); k < row.getLastCellNum(); k++) { Cell cell = row.getCell(k); String colName = (cell != null) ? cellValueToString(cell, null) : null; colName = colName != null ? colName.trim() : ""; if (colName.equals("")) { emptyColumns.add(k); } else if (!Utils.isNullStr(colName) && !elemNames.contains(colName.toLowerCase())) { extraColumns.add(colName); } } if (emptyColumns.size() > 0) { readerLogger.logInfo(sheetName, "Found data from column(s): " + StringUtils.join(emptyColumns, ", ") + ", but no column heading is available. The column(s) will be ignored."); } return extraColumns; } /** * Get cell String value. * * @param row Excel row. * @param colIdx Column index * @param schemaType Schema type * @return Textual cell value. */ private String getCellValue(Row row, Integer colIdx, String schemaType) { Cell cell = (colIdx == null || row == null) ? null : row.getCell(colIdx); String data = (cell == null) ? "" : cellValueToString(cell, schemaType); return data; } /** * If date formatted cell value is not higher than 4 digit number, then it is probably a year. * * @param doubleCellValue Numeric cell value. * @return boolean is year value. */ private boolean isYearValue(double doubleCellValue) { return doubleCellValue < 3000 && doubleCellValue > 0; } /** * Return Workbook object. * * @return Workbook object. */ protected Workbook getWorkbook() { return this.wb; } }