it.greenvulcano.excel.reader.ToCSVReader.java Source code

Introduction

Here is the source code for it.greenvulcano.excel.reader.ToCSVReader.java
Source

/*******************************************************************************
 * Copyright (c) 2009, 2016 GreenVulcano ESB Open Source Project.
 * All rights reserved.
 *
 * This file is part of GreenVulcano ESB.
 *
 * GreenVulcano ESB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *  
 * GreenVulcano ESB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *  
 * You should have received a copy of the GNU Lesser General Public License
 * along with GreenVulcano ESB. If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package it.greenvulcano.excel.reader;

import it.greenvulcano.configuration.XMLConfig;
import it.greenvulcano.excel.exception.ExcelException;
import it.greenvulcano.util.txt.TextUtils;

import java.util.ArrayList;
import java.util.List;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.w3c.dom.Node;

/**
 * 
 * @version 3.4.0 20/apr/2013
 * @author GreenVulcano Developer Team
 * 
 * Based on ToCSV POI example. Convert an Excel spreadsheet into a CSV
 * file. This class makes the following assumptions; 
 * <list> 
 * <li>1. Where the Excel workbook contains more that one worksheet, then a single
 * CSV file will contain the data from all of the worksheets.</li> 
 * <li>2. The data matrix contained in the CSV file will be square. This
 * means that the number of fields in each record of the CSV file will
 * match the number of cells in the longest row found in the Excel
 * workbook. Any short records will be 'padded' with empty fields - an
 * empty field is represented in the the CSV file in this way - ,,.</li>
 * <li>3. Empty fields will represent missing cells.</li> 
 * <li>4. A record consisting of empty fields will be used to represent an empty
 * row in the Excel workbook.</li> 
 * </list> 
 * Therefore, if the worksheet looked like this;
 * 
 * <pre>
 *  ___________________________________________
 *     |       |       |       |       |       |
 *     |   A   |   B   |   C   |   D   |   E   |
 *  ___|_______|_______|_______|_______|_______|
 *     |       |       |       |       |       |
 *   1 |   1   |   2   |   3   |   4   |   5   |
 *  ___|_______|_______|_______|_______|_______|
 *     |       |       |       |       |       |
 *   2 |       |       |       |       |       |
 *  ___|_______|_______|_______|_______|_______|
 *     |       |       |       |       |       |
 *   3 |       |   A   |       |   B   |       |
 *  ___|_______|_______|_______|_______|_______|
 *     |       |       |       |       |       |
 *   4 |       |       |       |       |   Z   |
 *  ___|_______|_______|_______|_______|_______|
 *     |       |       |       |       |       |
 *   5 | 1,400 |       |  250  |       |       |
 *  ___|_______|_______|_______|_______|_______|
 * 
 * </pre>
 * 
 * Then, the resulting CSV file will contain the following lines (records);
 * <pre>
 * 1,2,3,4,5
 * ,,,,
 * ,A,,B,
 * ,,,,Z
 * "1,400",,250,,
 * </pre>
 * <p>
 * Typically, the comma is used to separate each of the fields that,
 * together, constitute a single record or line within the CSV file.
 * This is not however a hard and fast rule and so this class allows the
 * user to determine which character is used as the field separator and
 * assumes the comma if none other is specified.
 * </p>
 * <p>
 * If a field contains the separator then it will be escaped. If the
 * file should obey Excel's CSV formatting rules, then the field will be
 * surrounded with speech marks whilst if it should obey UNIX
 * conventions, each occurrence of the separator will be preceded by the
 * backslash character.
 * </p>
 * <p>
 * If a field contains an end of line (EOL) character then it too will
 * be escaped. If the file should obey Excel's CSV formatting rules then
 * the field will again be surrounded by speech marks. On the other
 * hand, if the file should follow UNIX conventions then a single
 * backslash will precede the EOL character. There is no single
 * applicable standard for UNIX and some applications replace the CR
 * with \r and the LF with \n but this class will not do so.
 * </p>
 * <p>
 * If the field contains double quotes then that character will be
 * escaped. It seems as though UNIX does not define a standard for this
 * whilst Excel does. Should the CSV file have to obey Excel's
 * formatting rules then the speech mark character will be escaped with
 * a second set of speech marks. Finally, an enclosing set of speah
 * marks will also surround the entire field. Thus, if the following
 * line of text appeared in a cell - "Hello" he said - it would look
 * like this when converted into a field within a CSV file - """Hello""
 * he said".
 * </p>
 * 
 */
public class ToCSVReader extends BaseReader {
    /**
    *
    */
    protected static final String DEFAULT_END_LINE = "LF";

    /**
    *
    */
    protected static final String DEFAULT_SEPARATOR = ",";

    /**
     * Identifies that the CSV file should obey Excel's formatting conventions
     * with regard to escaping certain embedded characters - the field
     * separator, speech mark and end of line (EOL) character
     */
    public static final int EXCEL_STYLE_ESCAPING = 0;

    /**
     * Identifies that the CSV file should obey UNIX formatting conventions with
     * regard to escaping certain embedded characters - the field separator and
     * end of line (EOL) character
     */
    public static final int UNIX_STYLE_ESCAPING = 1;

    private String endLine;
    private String separator;
    private int formattingConvention = -1;

    private StringBuilder csvData = null;
    private List<List<String>> csvRows = null;
    private int maxRowWidth = 0;

    public ToCSVReader() {
        setSeparator(DEFAULT_SEPARATOR);
        setEndLine(TextUtils.getEOL(DEFAULT_END_LINE));
        formattingConvention = EXCEL_STYLE_ESCAPING;
    }

    @Override
    public void init(Node node) throws ExcelException {
        super.init(node);
        try {
            setSeparator(XMLConfig.get(node, "@separator", DEFAULT_SEPARATOR).replaceAll("\\\\t", "\t"));
            setEndLine(TextUtils.getEOL(XMLConfig.get(node, "@end-line", DEFAULT_END_LINE)));
            String formConv = XMLConfig.get(node, "@formatting-style", "excel");
            if (formConv.equals("excel")) {
                setFormattingConvention(EXCEL_STYLE_ESCAPING);
            } else if (formConv.equals("unix")) {
                setFormattingConvention(UNIX_STYLE_ESCAPING);
            } else {
                throw new ExcelException("Error initializing ExcelReader: invalid formatting [" + formConv + "]");
            }
        } catch (ExcelException exc) {
            throw exc;
        } catch (Exception exc) {
            throw new ExcelException("Error initializing ExcelReader", exc);
        }
    }

    public void setEndLine(String endLine) {
        this.endLine = endLine;
    }

    public String getEndLine() {
        return this.endLine;
    }

    public void setSeparator(String separator) {
        this.separator = separator;
    }

    public String getSeparator() {
        return this.separator;
    }

    public int getFormattingConvention() {
        return this.formattingConvention;
    }

    public void setFormattingConvention(int formattingConvention) {
        this.formattingConvention = formattingConvention;
    }

    @Override
    protected void startProcess() throws ExcelException {
        csvData = new StringBuilder();
        csvRows = new ArrayList<List<String>>();
    }

    @Override
    protected boolean processSheet(Sheet sheet, int sNum) throws ExcelException {
        return sheet.getPhysicalNumberOfRows() > 0;
    }

    /**
     * Called to convert a row of cells into a line of data that can be output
     * to the CSV file.
     * 
     * @param row
     *        An instance of either the HSSFRow or XSSFRow classes that
     *        encapsulates information about a row of cells recovered from an
     *        Excel workbook.
     */
    @Override
    protected void processRow(Row row, int sNum, int rNum) throws ExcelException {
        List<String> csvLine = new ArrayList<String>();

        // Check to ensure that a row was recovered from the sheet as it is
        // possible that one or more rows between other populated rows could be
        // missing - blank. If the row does contain cells then...
        if (row != null) {

            // Get the index for the right most cell on the row and then
            // step along the row from left to right recovering the contents
            // of each cell, converting that into a formatted String and
            // then storing the String into the csvLine ArrayList.
            int lastCellNum = row.getLastCellNum();
            int skipped = 0;
            for (int i = 0; i <= lastCellNum; i++) {
                if (colSkipper.skip(sNum, i)) {
                    skipped++;
                    continue;
                }
                Cell cell = row.getCell(i);
                if (cell == null) {
                    csvLine.add("");
                } else {
                    if (cell.getCellType() != Cell.CELL_TYPE_FORMULA) {
                        csvLine.add(formatter.formatCellValue(cell));
                    } else {
                        csvLine.add(formatter.formatCellValue(cell, evaluator));
                    }
                }
            }
            // Make a note of the index number of the right most cell. This value
            // will later be used to ensure that the matrix of data in the CSV
            // file is square.
            if (lastCellNum > maxRowWidth) {
                maxRowWidth = lastCellNum - skipped;
            }
        }
        csvRows.add(csvLine);
    }

    /**
     * Called to actually save the data recovered from the Excel workbook as a
     * CSV file.
     * 
     */
    @Override
    protected void endProcess() throws ExcelException {
        List<String> line = null;
        String csvLineElement = null;

        // Step through the elements of the ArrayList that was used to hold
        // all of the data recovered from the Excel workbooks' sheets, rows
        // and cells.
        for (int i = 0; i < csvRows.size(); i++) {
            // Get an element from the ArrayList that contains the data for
            // the workbook. This element will itself be an ArrayList
            // containing Strings and each String will hold the data recovered
            // from a single cell. The for() loop is used to recover elements
            // from this 'row' ArrayList one at a time and to write the Strings
            // away to a StringBuffer thus assembling a single line for inclusion
            // in the CSV file. If a row was empty or if it was short, then
            // the ArrayList that contains it's data will also be shorter than
            // some of the others. Therefore, it is necessary to check within
            // the for loop to ensure that the ArrayList contains data to be
            // processed. If it does, then an element will be recovered and
            // appended to the StringBuffer.
            line = csvRows.get(i);
            for (int j = 0; j < this.maxRowWidth; j++) {
                if (line.size() > j) {
                    csvLineElement = line.get(j);
                    if (csvLineElement != null) {
                        csvData.append(escapeEmbeddedCharacters(csvLineElement));
                    }
                }
                if (j < (maxRowWidth - 1)) {
                    csvData.append(separator);
                }
            }

            // Condition the inclusion of new line characters so as to
            // avoid an additional, superfluous, new line at the end of
            // the file.
            if (i < (csvRows.size() - 1)) {
                csvData.append(endLine);
            }
        }
    }

    /**
     * Checks to see whether the field - which consists of the formatted
     * contents of an Excel worksheet cell encapsulated within a String -
     * contains any embedded characters that must be escaped. The method is able
     * to comply with either Excel's or UNIX formatting conventions in the
     * following manner;
     * 
     * With regard to UNIX conventions, if the field contains any embedded field
     * separator or EOL characters they will each be escaped by prefixing a
     * leading backspace character. These are the only changes that have yet
     * emerged following some research as being required.
     * 
     * Excel has other embedded character escaping requirements, some that
     * emerged from empirical testing, other through research. Firstly, with
     * regards to any embedded speech marks ("), each occurrence should be
     * escaped with another speech mark and the whole field then surrounded with
     * speech marks. Thus if a field holds <em>"Hello" he said</em> then it
     * should be modified to appear as <em>"""Hello"" he said"</em>.
     * Furthermore, if the field contains either embedded separator or EOL
     * characters, it should also be surrounded with speech marks. As a result
     * <em>1,400</em> would become <em>"1,400"</em> assuming that the comma is
     * the required field separator. This has one consequence in, if a field
     * contains embedded speech marks and embedded separator characters, checks
     * for both are not required as the additional set of speech marks that
     * should be placed around ay field containing embedded speech marks will
     * also account for the embedded separator.
     * 
     * It is worth making one further note with regard to embedded EOL
     * characters. If the data in a worksheet is exported as a CSV file using
     * Excel itself, then the field will be surounded with speech marks. If the
     * resulting CSV file is then re-imports into another worksheet, the EOL
     * character will result in the original simgle field occupying more than
     * one cell. This same 'feature' is replicated in this classes behaviour.
     * 
     * @param field
     *        An instance of the String class encapsulating the formatted
     *        contents of a cell on an Excel worksheet.
     * @return A String that encapsulates the formatted contents of that Excel
     *         worksheet cell but with any embedded separator, EOL or speech
     *         mark characters correctly escaped.
     */
    private String escapeEmbeddedCharacters(String field) {
        StringBuffer buffer = null;

        // If the fields contents should be formatted to confrom with Excel's
        // convention....
        if (this.formattingConvention == EXCEL_STYLE_ESCAPING) {

            // Firstly, check if there are any speech marks (") in the field;
            // each occurrence must be escaped with another set of spech marks
            // and then the entire field should be enclosed within another
            // set of speech marks. Thus, "Yes" he said would become
            // """Yes"" he said"
            if (field.contains("\"")) {
                buffer = new StringBuffer(field.replaceAll("\"", "\\\"\\\""));
                buffer.insert(0, "\"");
                buffer.append("\"");
            } else {
                // If the field contains either embedded separator or EOL
                // characters, then escape the whole field by surrounding it
                // with speech marks.
                buffer = new StringBuffer(field);
                if ((buffer.indexOf(separator)) > -1 || (buffer.indexOf("\n")) > -1) {
                    buffer.insert(0, "\"");
                    buffer.append("\"");
                }
            }
            return (buffer.toString().trim());
        }
        // The only other formatting convention this class obeys is the UNIX one
        // where any occurrence of the field separator or EOL character will
        // be escaped by preceding it with a backslash.
        else {
            if (field.contains(this.separator)) {
                field = field.replaceAll(this.separator, ("\\\\" + this.separator));
            }
            if (field.contains("\n")) {
                field = field.replaceAll("\n", "\\\\\n");
            }
            return (field);
        }
    }

    @Override
    public Object getAsObject() throws ExcelException {
        if (csvData == null) {
            throw new ExcelException("No Excel parsed");
        }
        return csvData.toString();
    }

    @Override
    public String getAsString() throws ExcelException {
        return getAsObject().toString();
    }

    @Override
    public byte[] getAsBytes() throws ExcelException {
        return getAsString().getBytes();
    }

    @Override
    public void cleanUp() {
        csvData = null;
        csvRows = null;
    }

    @Override
    public void destroy() {
        cleanUp();
    }

}