org.pentaho.di.trans.steps.excelinput.staxpoi.StaxPoiSheet.java Source code

Java tutorial

Introduction

Here is the source code for org.pentaho.di.trans.steps.excelinput.staxpoi.StaxPoiSheet.java

Source

/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2015 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

/**
 * Author = Shailesh Ahuja
 */

package org.pentaho.di.trans.steps.excelinput.staxpoi;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.TimeZone;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTXf;
import org.pentaho.di.core.spreadsheet.KCell;
import org.pentaho.di.core.spreadsheet.KCellType;
import org.pentaho.di.core.spreadsheet.KSheet;

/**
 * Streaming reader for XLSX sheets.<br>
 * Rows should only be accessed sequentially: random access will severely impact performance.<br>
 */
public class StaxPoiSheet implements KSheet {

    // set to UTC for coherence with PoiSheet;
    private static final TimeZone DATE_TZ = TimeZone.getTimeZone("UTC");

    private final String sheetName;
    private final String sheetId;

    private final XSSFReader xssfReader;
    private InputStream sheetStream;
    private XMLStreamReader sheetReader;

    // hold the pointer to the current row so that access to the next row in the stream is quick and easy
    private int currentRow;
    private List<String> headerRow;
    private int numRows;
    private int numCols;
    // 1-based first non-empty row
    private int firstRow;
    private KCell[] currentRowCells;

    // full shared strings table
    private SharedStringsTable sst;
    // custom styles
    private StylesTable styles;

    public StaxPoiSheet(XSSFReader reader, String sheetName, String sheetID)
            throws InvalidFormatException, IOException, XMLStreamException {
        this.sheetName = sheetName;
        xssfReader = reader;
        sheetId = sheetID;
        sst = reader.getSharedStringsTable();
        styles = reader.getStylesTable();
        sheetStream = reader.getSheet(sheetID);
        XMLInputFactory factory = XMLInputFactory.newInstance();
        sheetReader = factory.createXMLStreamReader(sheetStream);
        headerRow = new ArrayList<String>();
        while (sheetReader.hasNext()) {
            int event = sheetReader.next();
            if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("dimension")) {
                String dim = sheetReader.getAttributeValue(null, "ref");
                // empty sheets have dimension with no range
                if (StringUtils.contains(dim, ':')) {
                    dim = dim.split(":")[1];
                    numRows = StaxUtil.extractRowNumber(dim);
                    numCols = StaxUtil.extractColumnNumber(dim);
                }
            }
            if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("row")) {
                currentRow = Integer.parseInt(sheetReader.getAttributeValue(null, "r"));
                firstRow = currentRow;

                // calculate the number of columns in the header row
                while (sheetReader.hasNext()) {
                    event = sheetReader.next();
                    if (event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals("row")) {
                        // if the row has ended, break the inner while loop
                        break;
                    }
                    if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("c")) {
                        String attributeValue = sheetReader.getAttributeValue(null, "t");
                        if (attributeValue != null && attributeValue.equals("s")) {
                            // only if the type of the cell is string, we continue
                            while (sheetReader.hasNext()) {
                                event = sheetReader.next();
                                if (event == XMLStreamConstants.START_ELEMENT
                                        && sheetReader.getLocalName().equals("v")) {
                                    int idx = Integer.parseInt(sheetReader.getElementText());
                                    String content = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                                    headerRow.add(content);
                                    break;
                                }
                            }
                        } else {
                            break;
                        }
                    }
                }
                // we have parsed the header row
                break;
            }
        }
    }

    @Override
    public KCell[] getRow(int rownr) {
        // xlsx raw row numbers are 1-based index, KSheet is 0-based

        if (rownr < 0 || rownr >= numRows) {
            // KSheet requires out of bounds here
            throw new ArrayIndexOutOfBoundsException(rownr);
        }
        if (rownr + 1 < firstRow) {
            // before first non-empty row
            return new KCell[0];
        }
        if (rownr > 0 && currentRow == rownr + 1) {
            return currentRowCells;
        }
        try {
            if (currentRow >= rownr + 1) {
                // allow random access per api despite performance hit
                resetSheetReader();
            }
            while (sheetReader.hasNext()) {
                int event = sheetReader.next();
                if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("row")) {
                    String rowIndicator = sheetReader.getAttributeValue(null, "r");
                    currentRow = Integer.parseInt(rowIndicator);
                    if (currentRow < rownr + 1) {
                        continue;
                    }
                    currentRowCells = parseRow();
                    return currentRowCells;
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        numRows = currentRow;
        return new KCell[] {};
    }

    private KCell[] parseRow() throws XMLStreamException {
        KCell[] cells = new StaxPoiCell[numCols];
        for (int i = 0; i < numCols; i++) {
            // go to the "c" cell tag
            while (sheetReader.hasNext()) {
                int event = sheetReader.next();
                if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("c")) {
                    break;
                }
                if (event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals("row")) {
                    // premature end of row, returning what we have
                    return cells;
                }
            }
            String cellLocation = sheetReader.getAttributeValue(null, "r");
            int columnIndex = StaxUtil.extractColumnNumber(cellLocation) - 1;

            String cellType = sheetReader.getAttributeValue(null, "t");
            String cellStyle = sheetReader.getAttributeValue(null, "s");

            boolean isFormula = false;
            String content = null;
            // get value tag
            while (sheetReader.hasNext()) {
                int event = sheetReader.next();
                if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("v")) {
                    // read content as string
                    if (cellType != null && cellType.equals("s")) {
                        int idx = Integer.parseInt(sheetReader.getElementText());
                        content = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                    } else {
                        content = sheetReader.getElementText();
                    }
                }
                if (event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals("f")) {
                    isFormula = true;
                }
                if (event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals("c")) {
                    break;
                }
            }
            if (content != null) {
                KCellType kcType = getCellType(cellType, cellStyle, isFormula);
                cells[columnIndex] = new StaxPoiCell(parseValue(kcType, content), kcType, currentRow);
            }
            // else let cell be null
        }
        return cells;
    }

    @Override
    public String getName() {
        return sheetName;
    }

    @Override
    public int getRows() {
        return numRows;
    }

    @Override
    public KCell getCell(int colnr, int rownr) {
        if (rownr == 0 && colnr < headerRow.size()) {
            // only possible to return header
            return new StaxPoiCell(headerRow.get(colnr), rownr);
        }
        // if random access this will be very expensive
        KCell[] row = getRow(rownr);
        if (row != null && rownr < row.length) {
            return row[colnr];
        }
        return null;
    }

    private KCellType getCellType(String cellType, String cellStyle, boolean isFormula) {
        // numeric type can be implicit or 'n'
        if (cellType == null || cellType.equals("n")) {
            // the only difference between date and numeric is the cell format
            if (isDateCell(cellStyle)) {
                return isFormula ? KCellType.DATE_FORMULA : KCellType.DATE;
            }
            return isFormula ? KCellType.NUMBER_FORMULA : KCellType.NUMBER;
        }
        switch (cellType) {
        case "s":
            return KCellType.LABEL;
        case "b":
            return isFormula ? KCellType.BOOLEAN_FORMULA : KCellType.BOOLEAN;
        case "e":
            // error
            return KCellType.EMPTY;
        case "str":
        default:
            return KCellType.STRING_FORMULA;
        }
    }

    private boolean isDateCell(String cellStyle) {
        if (cellStyle != null) {
            int styleIdx = Integer.parseInt(cellStyle);
            CTXf cellXf = styles.getCellXfAt(styleIdx);
            if (cellXf != null) {
                // need id for builtin types, format if custom
                int formatId = (int) cellXf.getNumFmtId();
                String format = styles.getNumberFormatAt(formatId);
                return DateUtil.isADateFormat(formatId, format);
            }
        }
        return false;
    }

    private Object parseValue(KCellType type, String vContent) {
        if (vContent == null) {
            return null;
        }
        try {
            switch (type) {
            case NUMBER:
            case NUMBER_FORMULA:
                return Double.parseDouble(vContent);
            case BOOLEAN:
            case BOOLEAN_FORMULA:
                return vContent.equals("1");
            case DATE:
            case DATE_FORMULA:
                Double xlDate = Double.parseDouble(vContent);
                return DateUtil.getJavaDate(xlDate, DATE_TZ);
            case LABEL:
            case STRING_FORMULA:
            case EMPTY:
            default:
                return vContent;
            }
        } catch (Exception e) {
            return vContent;
        }
    }

    private void resetSheetReader() throws IOException, XMLStreamException, InvalidFormatException {
        sheetReader.close();
        sheetStream.close();
        sheetStream = xssfReader.getSheet(sheetId);
        XMLInputFactory factory = XMLInputFactory.newInstance();
        sheetReader = factory.createXMLStreamReader(sheetStream);
    }

    public void close() throws IOException, XMLStreamException {
        sheetReader.close();
        sheetStream.close();
    }
}