Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package uk.ac.leeds.ccg.andyt.projects.pfi; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.FilenameFilter; import java.io.IOException; import java.io.StreamTokenizer; import java.util.ArrayList; import java.util.ListIterator; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.FormulaEvaluator; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import uk.ac.leeds.ccg.andyt.generic.io.Generic_StaticIO; /** * Class adapted from: * http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/ss/examples/ToCSV.java * Demonstrates <em>one</em> way to convert an Excel spreadsheet into a CSV * file. This class makes the following assumptions; * <list> * <li>1. Where the Excel workbook contains more that one worksheet, then a * single CSV file will contain the data from all of the worksheets.</li> * <li>2. The data matrix contained in the CSV file will be square. This means * that the number of fields in each record of the CSV file will match the * number of cells in the longest row found in the Excel workbook. Any short * records will be 'padded' with empty fields - an empty field is represented in * the the CSV file in this way - ,,.</li> * <li>3. Empty fields will represent missing cells.</li> * <li>4. A record consisting of empty fields will be used to represent an empty * row in the Excel workbook.</li> * </list> * Therefore, if the worksheet looked like this; * * <pre> * ___________________________________________ * | | | | | | * | A | B | C | D | E | * ___|_______|_______|_______|_______|_______| * | | | | | | * 1 | 1 | 2 | 3 | 4 | 5 | * ___|_______|_______|_______|_______|_______| * | | | | | | * 2 | | | | | | * ___|_______|_______|_______|_______|_______| * | | | | | | * 3 | | A | | B | | * ___|_______|_______|_______|_______|_______| * | | | | | | * 4 | | | | | Z | * ___|_______|_______|_______|_______|_______| * | | | | | | * 5 | 1,400 | | 250 | | | * ___|_______|_______|_______|_______|_______| * * </pre> * * Then, the resulting CSV file will contain the following lines (records); * <pre> * 1,2,3,4,5 * ,,,, * ,A,,B, * ,,,,Z * "1,400",,250,, * </pre><p> * Typically, the comma is used to separate each of the fields that, together, * constitute a single record or line within the CSV file. This is not however a * hard and fast rule and so this class allows the user to determine which * character is used as the field separator and assumes the comma if none other * is specified. * </p><p> * If a field contains the separator then it will be escaped. If the file should * obey Excel's CSV formatting rules, then the field will be surrounded with * speech marks whilst if it should obey UNIX conventions, each occurrence of * the separator will be preceded by the backslash character. * </p><p> * If a field contains an end of line (EOL) character then it too will be * escaped. If the file should obey Excel's CSV formatting rules then the field * will again be surrounded by speech marks. On the other hand, if the file * should follow UNIX conventions then a single backslash will precede the EOL * character. There is no single applicable standard for UNIX and some * appications replace the CR with \r and the LF with \n but this class will not * do so. * </p><p> * If the field contains double quotes then that character will be escaped. It * seems as though UNIX does not define a standard for this whilst Excel does. * Should the CSV file have to obey Excel's formmating rules then the speech * mark character will be escaped with a second set of speech marks. Finally, an * enclosing set of speah marks will also surround the entire field. Thus, if * the following line of text appeared in a cell - "Hello" he said - it would * look like this when converted into a field within a CSV file - """Hello"" he * said". * </p><p> * Finally, it is worth noting that talk of CSV 'standards' is really slightly * missleading as there is no such thing. It may well be that the code in this * class has to be modified to produce files to suit a specific application or * requirement. * </p> * * @author Mark B * @version 1.00 9th April 2010 1.10 13th April 2010 - Added support for * processing all Excel workbooks in a folder along with the ability to specify * a field separator character. 2.00 14th April 2010 - Added support for * embedded characters; the field separator, EOL and double quotes or speech * marks. In addition, gave the client the ability to select how these are * handled, either obeying Excel's or UNIX formatting conventions. */ public class XSLX2CSV { //private static ArrayList<String> headers = new ArrayList<String>(); private Workbook workbook = null; private ArrayList<ArrayList<ArrayList<String>>> csvData = null; private ArrayList<Integer> maxRowWidth = new ArrayList<Integer>(); private int formattingConvention = 0; private DataFormatter formatter = null; private FormulaEvaluator evaluator = null; private String separator = null; private static final String CSV_FILE_EXTENSION = ".csv"; private static final String DEFAULT_SEPARATOR = ","; /** * Identifies that the CSV file should obey Excel's formatting conventions * with regard to escaping certain embedded characters - the field * separator, speech mark and end of line (EOL) character */ public static final int EXCEL_STYLE_ESCAPING = 0; /** * Identifies that the CSV file should obey UNIX formatting conventions with * regard to escaping certain embedded characters - the field separator and * end of line (EOL) character */ public static final int UNIX_STYLE_ESCAPING = 1; /** * Process the contents of a folder, convert the contents of each Excel * workbook into CSV format and save the resulting file to the specified * folder using the same name as the original workbook with the .xls or * .xlsx extension replaced by .csv. This method will ensure that the CSV * file created contains the comma field separator and that embedded * characters such as the field separator, the EOL and double quotes are * escaped in accordance with Excel's convention. * * @param strSource An instance of the String class that encapsulates the * name of and path to either a folder containing those Excel workbook(s) or * the name of and path to an individual Excel workbook that is/are to be * converted. * @param strDestination An instance of the String class encapsulating the * name of and path to a folder that will contain the resulting CSV files. * @throws java.io.FileNotFoundException Thrown if any file cannot be * located on the filesystem during processing. * @throws java.io.IOException Thrown if the filesystem encounters any * problems during processing. * @throws java.lang.IllegalArgumentException Thrown if the values passed to * the strSource parameter refers to a file or folder that does not exist or * if the value passed to the strDestination paramater refers to a folder * that does not exist or simply does not refer to a folder. * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown * if the xml markup encountered whilst parsing a SpreadsheetML file (.xlsx) * is invalid. */ public void convertExcelToCSV(String strSource, String strDestination) throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException { // Simply chain the call to the overloaded convertExcelToCSV(String, // String, String, int) method, pass the default separator and ensure // that certain embedded characters are escaped in accordance with // Excel's formatting conventions this.convertExcelToCSV(strSource, strDestination, XSLX2CSV.DEFAULT_SEPARATOR, XSLX2CSV.EXCEL_STYLE_ESCAPING); } /** * Process the contents of a folder, convert the contents of each Excel * workbook into CSV format and save the resulting file to the specified * folder using the same name as the original workbook with the .xls or * .xlsx extension replaced by .csv. This method allows the client to define * the field separator but will ensure that embedded characters such as the * field separator, the EOL and double quotes are escaped in accordance with * Excel's convention. * * @param strSource An instance of the String class that encapsulates the * name of and path to either a folder containing those Excel workbook(s) or * the name of and path to an individual Excel workbook that is/are to be * converted. * @param strDestination An instance of the String class encapsulating the * name of and path to a folder that will contain the resulting CSV files. * @param separator An instance of the String class that encapsulates the * character or characters the client wishes to use as the field separator. * @throws java.io.FileNotFoundException Thrown if any file cannot be * located on the filesystem during processing. * @throws java.io.IOException Thrown if the filesystem encounters any * problems during processing. * @throws java.lang.IllegalArgumentException Thrown if the values passed to * the strSource parameter refers to a file or folder that does not exist or * if the value passed to the strDestination paramater refers to a folder * that does not exist or simply does not refer to a folder. * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown * if the xml markup encounetered whilst parsing a SpreadsheetML file * (.xlsx) is invalid. */ public void convertExcelToCSV(String strSource, String strDestination, String separator) throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException { // Simply chain the call to the overloaded convertExcelToCSV(String, // String, String, int) method and ensure that certain embedded // characters are escaped in accordance with Excel's formatting // conventions this.convertExcelToCSV(strSource, strDestination, separator, XSLX2CSV.EXCEL_STYLE_ESCAPING); } /** * Process the contents of a folder, convert the contents of each Excel * workbook into CSV format and save the resulting file to the specified * folder using the same name as the original workbook with the .xls or * .xlsx extension replaced by .csv * * @param strSource An instance of the String class that encapsulates the * name of and path to either a folder containing those Excel workbook(s) or * the name of and path to an individual Excel workbook that is/are to be * converted. * @param strDestination An instance of the String class encapsulating the * name of and path to a folder that will contain the resulting CSV files. * @param formattingConvention A primitive int whose value will determine * whether certain embedded characters should be escaped in accordance with * Excel's or UNIX formatting conventions. Two constants are defined to * support this option; ToCSV.EXCEL_STYLE_ESCAPING and * ToCSV.UNIX_STYLE_ESCAPING * @param separator An instance of the String class encapsulating the * characters or characters that should be used to separate items on a line * within the CSV file. * @throws java.io.FileNotFoundException Thrown if any file cannot be * located on the filesystem during processing. * @throws java.io.IOException Thrown if the filesystem encounters any * problems during processing. * @throws java.lang.IllegalArgumentException Thrown if the values passed to * the strSource parameter refers to a file or folder that does not exist, * if the value passed to the strDestination paramater refers to a folder * that does not exist, if the value passed to the strDestination parameter * does not refer to a folder or if the value passed to the * formattingConvention parameter is other than one of the values defined by * the constants ToCSV.EXCEL_STYLE_ESCAPING and ToCSV.UNIX_STYLE_ESCAPING. * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown * if the xml markup encounetered whilst parsing a SpreadsheetML file * (.xlsx) is invalid. */ public void convertExcelToCSV(String strSource, String strDestination, String separator, int formattingConvention) throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException { File source = new File(strSource); File destination = new File(strDestination); File[] filesList = null; // Check that the source file/folder exists. if (!source.exists()) { throw new IllegalArgumentException("The source for the Excel " + "file(s) cannot be found."); } // Ensure thaat the folder the user has chosen to save the CSV files // away into firstly exists and secondly is a folder rather than, for // instance, a data file. if (!destination.exists()) { throw new IllegalArgumentException( "The folder/directory for the " + "converted CSV file(s) does not exist."); } if (!destination.isDirectory()) { throw new IllegalArgumentException( "The destination for the CSV " + "file(s) is not a directory/folder."); } // Ensure the value passed to the formattingConvention parameter is // within range. if (formattingConvention != XSLX2CSV.EXCEL_STYLE_ESCAPING && formattingConvention != XSLX2CSV.UNIX_STYLE_ESCAPING) { throw new IllegalArgumentException( "The value passed to the " + "formattingConvention parameter is out of range."); } // Copy the spearator character and formatting convention into local // variables for use in other methods. this.separator = separator; this.formattingConvention = formattingConvention; // Check to see if the sourceFolder variable holds a reference to // a file or a folder full of files. if (source.isDirectory()) { // Get a list of all of the Excel spreadsheet files (workbooks) in // the source folder/directory filesList = source.listFiles(new ExcelFilenameFilter()); } else { // Assume that it must be a file handle - although there are other // options the code should perhaps check - and store the reference // into the filesList variable. filesList = new File[] { source }; } // Step through each of the files in the source folder and for each // open the workbook, convert it's contents to CSV format and then // save the resulting file away into the folder specified by the // contents of the destination variable. Note that the name of the // csv file will be created by taking the name of the Excel file, // removing the extension and replacing it with .csv. Note that there // is one drawback with this approach; if the folder holding the files // contains two workbooks whose names match but one is a binary file // (.xls) and the other a SpreadsheetML file (.xlsx), then the names // for both CSV files will be identical and one CSV file will, // therefore, over-write the other. for (File excelFile : filesList) { // Open the workbook this.openWorkbook(excelFile); // Convert it's contents into a CSV file this.convertToCSV(excelFile, destination); } } /** * Open an Excel workbook ready for conversion. * * @param file An instance of the File class that encapsulates a handle to a * valid Excel workbook. Note that the workbook can be in either binary * (.xls) or SpreadsheetML (.xlsx) format. * @throws java.io.FileNotFoundException Thrown if the file cannot be * located. * @throws java.io.IOException Thrown if a problem occurs in the file * system. * @throws org.apache.poi.openxml4j.exceptions.InvalidFormatException Thrown * if invalid xml is found whilst parsing an input SpreadsheetML file. */ private void openWorkbook(File file) throws FileNotFoundException, IOException, InvalidFormatException { FileInputStream fis = null; try { System.out.println("Opening workbook [" + file.getName() + "]"); fis = new FileInputStream(file); // Open the workbook and then create the FormulaEvaluator and // DataFormatter instances that will be needed to, respectively, // force evaluation of forumlae found in cells and create a // formatted String encapsulating the cells contents. this.workbook = WorkbookFactory.create(fis); this.evaluator = this.workbook.getCreationHelper().createFormulaEvaluator(); this.formatter = new DataFormatter(true); } finally { if (fis != null) { fis.close(); } } } /** * Called to convert the contents of the currently opened workbook into a * CSV file. */ private void convertToCSV(File excelFile, File destination) { Sheet sheet = null; Row row = null; int lastRowNum = 0; int firstRowNum = 0; this.csvData = new ArrayList<ArrayList<ArrayList<String>>>(); System.out.println("Converting files contents to CSV format."); // Discover how many sheets there are in the workbook.... int numSheets = this.workbook.getNumberOfSheets(); // and then iterate through them. for (int sheetid = 0; sheetid < numSheets; sheetid++) { this.csvData.add(new ArrayList<ArrayList<String>>()); // Get a reference to a sheet and check to see if it contains // any rows. sheet = this.workbook.getSheetAt(sheetid); if (sheet.getPhysicalNumberOfRows() > 0) { // Note down the index number of the bottom-most row and // then iterate through all of the rows on the sheet starting // from the very first row - number 1 - even if it is missing. // Recover a reference to the row and then call another method // which will strip the data from the cells and build lines // for inclusion in the resylting CSV file. lastRowNum = sheet.getLastRowNum(); firstRowNum = sheet.getFirstRowNum(); for (int j = firstRowNum; j <= lastRowNum; j++) { row = sheet.getRow(j); this.rowToCSV(row, sheetid); } } try { // Save the CSV file away using the newly constricted file name // and to the specified directory. this.saveCSVFile(destination); } catch (IOException ex) { Logger.getLogger(XSLX2CSV.class.getName()).log(Level.SEVERE, null, ex); } } } /** * Called to actually save the data recovered from the Excel workbook as a * CSV file. * * @param file An instance of the File class that encapsulates a handle * referring to the CSV file. * @throws java.io.FileNotFoundException Thrown if the file cannot be found. * @throws java.io.IOException Thrown to indicate and error occurred in the * underylying file system. */ private void saveCSVFile(File file) throws FileNotFoundException, IOException { FileWriter fw = null; BufferedWriter bw = null; ArrayList<String> line = null; StringBuffer buffer = null; String csvLineElement = null; //String header; int sheetid = 0; File outFile; ListIterator<ArrayList<ArrayList<String>>> ite; ite = this.csvData.listIterator(); while (ite.hasNext()) { ArrayList<ArrayList<String>> sheet; sheet = ite.next(); //header = headers.get(sheetid); outFile = new File(file, "Sheet" + sheetid + XSLX2CSV.CSV_FILE_EXTENSION); try { System.out.println("Saving the CSV file [" + outFile.getName() + "]"); // Open a writer onto the CSV file. fw = new FileWriter(outFile); bw = new BufferedWriter(fw); // if (header != null) { // bw.write(header); // bw.newLine(); // } // Step through the elements of the ArrayList that was used to hold // all of the data recovered from the Excel workbooks' sheets, rows // and cells. for (int i = 0; i < sheet.size(); i++) { buffer = new StringBuffer(); // Get an element from the ArrayList that contains the data for // the workbook. This element will itself be an ArrayList // containing Strings and each String will hold the data recovered // from a single cell. The for() loop is used to recover elements // from this 'row' ArrayList one at a time and to write the Strings // away to a StringBuffer thus assembling a single line for inclusion // in the CSV file. If a row was empty or if it was short, then // the ArrayList that contains it's data will also be shorter than // some of the others. Therefore, it is necessary to check within // the for loop to ensure that the ArrayList contains data to be // processed. If it does, then an element will be recovered and // appended to the StringBuffer. line = sheet.get(i); for (int j = 0; j < this.maxRowWidth.get(sheetid); j++) { if (line.size() > j) { csvLineElement = line.get(j); if (csvLineElement != null) { buffer.append(this.escapeEmbeddedCharacters(csvLineElement)); } } if (j < (this.maxRowWidth.get(sheetid) - 1)) { buffer.append(this.separator); } } // Once the line is built, write it away to the CSV file. bw.write(buffer.toString().trim()); // Condition the inclusion of new line characters so as to // avoid an additional, superfluous, new line at the end of // the file. if (i < (this.csvData.get(sheetid).size() - 1)) { bw.newLine(); } } } finally { if (bw != null) { bw.flush(); bw.close(); } } sheetid++; } } /** * Called to convert a row of cells into a line of data that can later be * output to the CSV file. * * @param row An instance of either the HSSFRow or XSSFRow classes that * encapsulates information about a row of cells recovered from an Excel * workbook. */ private void rowToCSV(Row row, int sheetid) { Cell cell = null; int lastCellNum = 0; ArrayList<String> csvLine = new ArrayList<String>(); // Check to ensure that a row was recovered from the sheet as it is // possible that one or more rows between other populated rows could be // missing - blank. If the row does contain cells then... if (row != null) { // Get the index for the right most cell on the row and then // step along the row from left to right recovering the contents // of each cell, converting that into a formatted String and // then storing the String into the csvLine ArrayList. lastCellNum = row.getLastCellNum(); for (int i = 0; i <= lastCellNum; i++) { cell = row.getCell(i); if (cell == null) { csvLine.add(""); } else if (cell.getCellType() != Cell.CELL_TYPE_FORMULA) { csvLine.add(this.formatter.formatCellValue(cell)); } else { csvLine.add(this.formatter.formatCellValue(cell, this.evaluator)); } } // Make a note of the index number of the right most cell. This value // will later be used to ensure that the matrix of data in the CSV file // is square. if (this.maxRowWidth.size() < (sheetid + 1)) { this.maxRowWidth.add(0); } if (lastCellNum > this.maxRowWidth.get(sheetid)) { this.maxRowWidth.set(sheetid, lastCellNum); } } this.csvData.get(sheetid).add(csvLine); } /** * Checks to see whether the field - which consists of the formatted * contents of an Excel worksheet cell encapsulated within a String - * contains any embedded characters that must be escaped. The method is able * to comply with either Excel's or UNIX formatting conventions in the * following manner; * * With regard to UNIX conventions, if the field contains any embedded field * separator or EOL characters they will each be escaped by prefixing a * leading backspace character. These are the only changes that have yet * emerged following some research as being required. * * Excel has other embedded character escaping requirements, some that * emerged from empirical testing, other through research. Firstly, with * regards to any embedded speech marks ("), each occurrence should be * escaped with another speech mark and the whole field then surrounded with * speech marks. Thus if a field holds <em>"Hello" he said</em> then it * should be modified to appear as <em>"""Hello"" he said"</em>. * Furthermore, if the field contains either embedded separator or EOL * characters, it should also be surrounded with speech marks. As a result * <em>1,400</em> would become * <em>"1,400"</em> assuming that the comma is the required field separator. * This has one consequence in, if a field contains embedded speech marks * and embedded separator characters, checks for both are not required as * the additional set of speech marks that should be placed around ay field * containing embedded speech marks will also account for the embedded * separator. * * It is worth making one further note with regard to embedded EOL * characters. If the data in a worksheet is exported as a CSV file using * Excel itself, then the field will be surounded with speech marks. If the * resulting CSV file is then re-imports into another worksheet, the EOL * character will result in the original simgle field occupying more than * one cell. This same 'feature' is replicated in this classes behaviour. * * @param field An instance of the String class encapsulating the formatted * contents of a cell on an Excel worksheet. * @return A String that encapsulates the formatted contents of that Excel * worksheet cell but with any embedded separator, EOL or speech mark * characters correctly escaped. */ private String escapeEmbeddedCharacters(String field) { StringBuffer buffer = null; // If the fields contents should be formatted to confrom with Excel's // convention.... if (this.formattingConvention == XSLX2CSV.EXCEL_STYLE_ESCAPING) { // Firstly, check if there are any speech marks (") in the field; // each occurrence must be escaped with another set of spech marks // and then the entire field should be enclosed within another // set of speech marks. Thus, "Yes" he said would become // """Yes"" he said" if (field.contains("\"")) { buffer = new StringBuffer(field.replaceAll("\"", "\\\"\\\"")); buffer.insert(0, "\""); buffer.append("\""); } else { // If the field contains either embedded separator or EOL // characters, then escape the whole field by surrounding it // with speech marks. buffer = new StringBuffer(field); if ((buffer.indexOf(this.separator)) > -1 || (buffer.indexOf("\n")) > -1) { buffer.insert(0, "\""); buffer.append("\""); } } return (buffer.toString().trim()); } // The only other formatting convention this class obeys is the UNIX one // where any occurrence of the field separator or EOL character will // be escaped by preceding it with a backslash. else { if (field.contains(this.separator)) { field = field.replaceAll(this.separator, ("\\\\" + this.separator)); } if (field.contains("\n")) { field = field.replaceAll("\n", "\\\\\n"); } return (field); } } /** * An instance of this class can be used to control the files returned be a * call to the listFiles() method when made on an instance of the File class * and that object refers to a folder/directory */ class ExcelFilenameFilter implements FilenameFilter { /** * Determine those files that will be returned by a call to the * listFiles() method. In this case, the name of the file must end with * either of the following two extension; '.xls' or '.xlsx'. For the * future, it is very possible to parameterise this and allow the * containing class to pass, for example, an array of Strings to this * class on instantiation. Each element in that array could encapsulate * a valid file extension - '.xls', '.xlsx', '.xlt', '.xlst', etc. These * could then be used to control which files were returned by the call * to the listFiles() method. * * @param file An instance of the File class that encapsulates a handle * referring to the folder/directory that contains the file. * @param name An instance of the String class that encapsulates the * name of the file. * @return A boolean value that indicates whether the file should be * included in the array retirned by the call to the listFiles() method. * In this case true will be returned if the name of the file ends with * either '.xls' or '.xlsx' and false will be returned in all other * instances. */ public boolean accept(File file, String name) { return (name.endsWith(".xls") || name.endsWith(".xlsx")); } } }