Java tutorial
/* * ARX: Powerful Data Anonymization * Copyright (C) 2014 Karol Babioch <karol@babioch.de> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.deidentifier.arx.io; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.deidentifier.arx.io.ImportConfigurationExcel.ExcelFileTypes; /** * Import adapter for Excel files * * This adapter can import data from Excel files. It handles both XLS and XLSX * files. The file type itself is defined by {@link ImportConfigurationExcel}. * The files are accessed using Apache POI. * * @see <a href="https://poi.apache.org/">Aapache POI</a> * * @author Karol Babioch * @author Fabian Prasser */ public class ImportAdapterExcel extends ImportAdapter { /** * The configuration describing the Excel file */ private ImportConfigurationExcel config; /** * Actual iterator used to go through data */ private Iterator<Row> iterator; /** * Contains the last row as returned by the iterator * * @note This row cannot be simply returned, but needs to be further * processed, e.g. to return only selected columns. */ private Row row; /** * Indicates whether the first row has already been returned * * The first row contains the name of the columns and always needs to be * returned first in order to guarantee that the framework will pick up the * names correctly. */ private boolean headerReturned = false; /** * Number of rows within the specified sheet */ private int totalRows; /** * Current row {@link lastRow} is referencing */ private int currentRow = 0; /** * Holds the number of columns * * This is set in the first iteration and is checked against in every other * iteration. Once a row contains more columns that this, an exception is * thrown. */ private int numberOfColumns; private FileInputStream input; /** * Creates a new instance of this object with given configuration * * Depending upon the file type it either uses HSSF or XSSF to access the * file. In both cases {@link #iterator} will be assigned a reference to * an iterator, which can then be used to access the actual data on a row by * row basis. * * @param config * {@link #config} * * @throws IOException * In case file doesn't contain actual data */ protected ImportAdapterExcel(ImportConfigurationExcel config) throws IOException { super(config); this.config = config; /* Get row iterator */ input = new FileInputStream(config.getFileLocation()); Workbook workbook = null; if (config.getExcelFileType() == ExcelFileTypes.XLS) { workbook = new HSSFWorkbook(input); } else if (config.getExcelFileType() == ExcelFileTypes.XLSX) { workbook = new XSSFWorkbook(input); } else { input.close(); throw new IllegalArgumentException("File type not supported"); } workbook.setMissingCellPolicy(Row.CREATE_NULL_AS_BLANK); Sheet sheet = workbook.getSheetAt(config.getSheetIndex()); iterator = sheet.iterator(); /* Get total number of rows */ totalRows = sheet.getPhysicalNumberOfRows(); /* Check whether there is actual data within the file */ if (iterator.hasNext()) { row = iterator.next(); if (config.getContainsHeader()) { if (!iterator.hasNext()) { throw new IOException("File contains nothing but header"); } } } else { throw new IOException("File contains no data"); } // Create header header = createHeader(); } /** * Returns the percentage of data that has already been returned * * The basis for this calculation is the row currently being accessed. * * @see {@link #currentRow} * @see {@link #totalRows} */ @Override public int getProgress() { return (int) ((double) currentRow / (double) totalRows * 100d); } /** * Indicates whether there is another element to return * * This returns true when the file contains another line, which could be * accessed by {@link #iterator}. * * @note {@link #row} effectively works as buffer and will always be set * up by the previous iteration, so once there is no data, it will be * assigned <code>null</code>, which is checked for here. */ @Override public boolean hasNext() { return row != null; } /** * Returns the next row * * The returned element is sorted as defined by {@link ImportColumn#index} * and contains as many elements as there are columns selected to import * from {@link #indexes}. The first row will always contain the names of the * columns. {@link #headerReturned} is used to keep track of that. * * @throws IllegalArgumentException * In case defined datatypes don't match */ @Override public String[] next() { /* Check whether header was already returned */ if (!headerReturned) { headerReturned = true; return header; } /* Check whether number of columns is too big */ if (row.getPhysicalNumberOfCells() > numberOfColumns) { throw new IllegalArgumentException("Number of columns in row " + currentRow + " is too big"); } /* Create regular row */ String[] result = new String[indexes.length]; for (int i = 0; i < indexes.length; i++) { row.getCell(indexes[i]).setCellType(Cell.CELL_TYPE_STRING); result[i] = row.getCell(indexes[i]).getStringCellValue(); if (!dataTypes[i].isValid(result[i])) { throw new IllegalArgumentException("Data value does not match data type"); } } /* Fetches the next row, which will be used in next iteration */ if (iterator.hasNext()) { row = iterator.next(); currentRow++; } else { row = null; try { input.close(); } catch (Exception e) { /* Die silently*/ } } /* Return resulting row */ return result; } /** * Dummy */ @Override public void remove() { throw new UnsupportedOperationException(); } /** * Creates the header row * * This returns a string array with the names of the columns that will be * returned later on by iterating over this object. Depending upon the * configuration {@link ImportConfigurationExcel#getContainsHeader()} and * whether or not names have been assigned explicitly either the appropriate * values will be returned, or names will be made up on the fly following * the pattern "Column #x", where x is incremented for each column. */ private String[] createHeader() { /* Preparation work */ if (config.getContainsHeader()) this.config.prepare(row); this.indexes = getIndexesToImport(); this.dataTypes = getColumnDatatypes(); /* Initialization */ String[] header = new String[config.getColumns().size()]; List<ImportColumn> columns = config.getColumns(); /* Create header */ for (int i = 0, len = columns.size(); i < len; i++) { ImportColumn column = columns.get(i); row.getCell(((ImportColumnExcel) column).getIndex()).setCellType(Cell.CELL_TYPE_STRING); String name = row.getCell(((ImportColumnExcel) column).getIndex()).getStringCellValue(); if (config.getContainsHeader() && !name.equals("")) { /* Assign name of file itself */ header[i] = name; } else { /* Nothing defined in header (or empty), build name manually */ header[i] = "Column #" + ((ImportColumnExcel) column).getIndex(); } if (column.getAliasName() != null) { /* Name has been assigned explicitly */ header[i] = column.getAliasName(); } column.setAliasName(header[i]); } /* Fetch next row in preparation for next iteration */ if (config.getContainsHeader()) { if (iterator.hasNext()) { row = iterator.next(); currentRow++; } else { row = null; } } /* Store number of columns */ numberOfColumns = header.length; /* Return header */ return header; } /** * Returns an array with indexes of columns that should be imported * * Only columns listed within {@link #column} will be imported. This * iterates over the list of columns and returns an array with indexes of * columns that should be imported. * * @return Array containing indexes of columns that should be imported */ protected int[] getIndexesToImport() { /* Get indexes to import from */ ArrayList<Integer> indexes = new ArrayList<Integer>(); for (ImportColumn column : config.getColumns()) { indexes.add(((ImportColumnExcel) column).getIndex()); } int[] result = new int[indexes.size()]; for (int i = 0; i < result.length; i++) { result[i] = indexes.get(i); } return result; } }