org.pentaho.di.trans.steps.xmlinputstream.XMLInputStream.java Source code

Java tutorial

Introduction

Here is the source code for org.pentaho.di.trans.steps.xmlinputstream.XMLInputStream.java

Source

/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.xmlinputstream;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.vfs2.FileSystemException;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Use a StAX parser to read XML in a flexible and fast way.
 *
 * @author Jens Bleuel
 * @since 2011-01-13
 */
// TODO black box testing
public class XMLInputStream extends BaseStep implements StepInterface {
    private static Class<?> PKG = XMLInputStream.class; // for i18n purposes, needed by Translator2!!
    // NOTE(review): the PKG declaration is intentionally left untouched (not made final); the i18n tooling
    // mentioned above may locate it textually - confirm before changing its shape.

    /**
     * Size of the per-level bookkeeping arrays (IDs, names, paths), i.e. the maximum number of nested elements.
     * We may let the user configure this later on.
     */
    private static final int PARENT_ID_ALLOCATE_SIZE = 1000;

    /** Step settings, assigned in init() and dispose(). */
    private XMLInputStreamMeta meta;

    /** Runtime state of this step copy, assigned in init() and dispose(). */
    private XMLInputStreamData data;

    /** Index of the incoming field that carries the XML text; only resolved when meta.sourceFromInput is set. */
    private int inputFieldIndex;

    /**
     * Event type descriptions, indexed by the numeric StAX event type. Index 0 ("UNKNOWN") is the fallback used
     * for event types that have no entry in this table.
     */
    static final String[] eventDescription = { "UNKNOWN", "START_ELEMENT", "END_ELEMENT", "PROCESSING_INSTRUCTION",
            "CHARACTERS", "COMMENT", "SPACE", "START_DOCUMENT", "END_DOCUMENT", "ENTITY_REFERENCE", "ATTRIBUTE",
            "DTD", "CDATA", "NAMESPACE", "NOTATION_DECLARATION", "ENTITY_DECLARATION" };

    /**
     * Creates the step; every argument is handed unchanged to the {@link BaseStep} constructor.
     */
    public XMLInputStream(
            StepMeta stepMeta,
            StepDataInterface stepDataInterface,
            int copyNr,
            TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }

    /**
     * Emits the next output row(s) read from the current XML source.
     * <p>
     * Two modes exist, selected by <code>meta.sourceFromInput</code>:
     * <ul>
     * <li>file mode: the files listed in <code>data.filenames</code> (taken from the previous step when not set
     * during init) are read one after another;</li>
     * <li>input-field mode: every incoming row carries one XML document in the field
     * <code>meta.sourceFieldName</code>.</li>
     * </ul>
     *
     * @return <code>true</code> when the method should be called again, <code>false</code> when the step is done
     *         (no more files/rows, or the row limit was reached)
     * @throws KettleException
     *           when no rows arrive in input-field mode, the source field is missing, or the XML cannot be read
     */
    @Override
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        if (first && !meta.sourceFromInput) {
            first = false;

            if (data.filenames == null) {
                getFilenamesFromPreviousSteps();
            }
            if (!openNextFile()) {
                // No file could be opened (empty file list): finish cleanly instead of
                // running into a null event reader further down.
                setOutputDone();
                return false;
            }
            resetElementCounters();
        }

        Object[] outputRowData;
        if (meta.sourceFromInput) {
            Object[] row = null;
            if (first) {
                first = false;
                row = getRow();
                // get input field index
                if (getInputRowMeta() == null) {
                    throw new KettleException(BaseMessages.getString(PKG, "XMLInputStream.NoIncomingRowsFound"));
                }
                inputFieldIndex = getInputRowMeta().indexOfValue(meta.sourceFieldName);
                if (inputFieldIndex < 0) {
                    throw new KettleException(BaseMessages.getString(PKG, "XMLInputStream.FilenameFieldNotFound",
                            meta.sourceFieldName));
                }
            }
            if (data.xmlEventReader == null) {
                // No document in progress: take the row fetched above, or fetch the next one.
                if (row == null) {
                    row = getRow();
                }
                if (row == null) {
                    setOutputDone(); // signal end to receiver(s)
                    return false; // This is the end of this step.
                }
                // NOTE(review): a null field value would fail inside StringReader - confirm whether such rows
                // should be skipped or reported.
                String xml = getInputRowMeta().getString(row, inputFieldIndex);
                try {
                    data.xmlEventReader = data.staxInstance.createXMLEventReader(new StringReader(xml));
                } catch (XMLStreamException e) {
                    throw new KettleException(e);
                }
                resetElementCounters();
            }
            outputRowData = getRowFromXML();
            if (outputRowData == null) {
                // Nothing more to emit for this document: release the reader (best effort) and
                // continue with the next incoming row on the following call.
                try {
                    data.xmlEventReader.close();
                } catch (XMLStreamException e) {
                    if (log.isBasic()) {
                        log.logBasic("Unable to close the XML reader of the processed input row", e);
                    }
                }
                data.xmlEventReader = null;
                return true;
            }
        } else {
            outputRowData = getRowFromXML();
            if (outputRowData == null) {
                if (openNextFile()) {
                    resetElementCounters();
                    return true;
                } else {
                    setOutputDone(); // signal end to receiver(s)
                    return false; // This is the end of this step.
                }
            }
        }

        putRowOut(outputRowData);

        // limit has been reached: stop now. (not exact science since some attributes could be mixed within the last row)
        if (data.rowLimit > 0 && data.rowNumber >= data.rowLimit) {
            setOutputDone();
            return false;
        }
        return true;
    }

    /**
     * Closes the currently open file (if any) and opens the next entry of <code>data.filenames</code>, creating a
     * fresh event reader for it with the configured encoding. When requested by the step settings the opened file
     * is also registered as a result file.
     *
     * @return <code>true</code> if a file was opened, <code>false</code> if all files have been processed
     * @throws KettleException
     *           wrapping any I/O or XML stream problem raised while opening the file
     */
    private boolean openNextFile() throws KettleException {
        try {
            closeFile();
            final boolean allFilesDone = data.filenr >= data.filenames.length;
            if (allFilesDone) {
                return false;
            }
            data.fileObject = KettleVFS.getFileObject(data.filenames[data.filenr], getTransMeta());
            data.inputStream = KettleVFS.getInputStream(data.fileObject);
            data.xmlEventReader = data.staxInstance.createXMLEventReader(data.inputStream, data.encoding);
        } catch (IOException ioProblem) {
            throw new KettleException(ioProblem);
        } catch (XMLStreamException xmlProblem) {
            throw new KettleException(xmlProblem);
        }

        // the file is open now, advance to the next slot
        data.filenr++;

        if (!meta.isAddResultFile()) {
            return true;
        }

        // Add this to the result file names...
        ResultFile readFile = new ResultFile(ResultFile.FILE_TYPE_GENERAL, data.fileObject,
                getTransMeta().getName(), getStepname());
        readFile.setComment(BaseMessages.getString(PKG, "XMLInputStream.Log.ResultFileWasRead"));
        addResultFile(readFile);
        return true;
    }

    /**
     * Closes the event reader, the input stream and the file object held in <code>data</code>, in that order.
     * Closing is best effort: a failure is logged at basic level and the remaining handles are still closed.
     * Every handle is cleared afterwards so that a later call (for example from dispose) does not close it twice.
     */
    private void closeFile() {
        if (data.xmlEventReader != null) {
            try {
                data.xmlEventReader.close();
            } catch (XMLStreamException e) {
                logUnableToCloseFile(e);
            } finally {
                data.xmlEventReader = null;
            }
        }
        if (data.inputStream != null) {
            try {
                data.inputStream.close();
            } catch (IOException e) {
                logUnableToCloseFile(e);
            } finally {
                data.inputStream = null;
            }
        }
        if (data.fileObject != null) {
            try {
                data.fileObject.close();
            } catch (FileSystemException e) {
                logUnableToCloseFile(e);
            } finally {
                data.fileObject = null;
            }
        }
    }

    /**
     * Logs a close failure at basic level. The file name is looked up defensively: <code>data.filenames</code> is
     * null when the XML comes from an input field, and <code>data.filenr</code> is still 0 before the first file
     * was opened successfully, so indexing the array blindly would fail inside the error handler.
     *
     * @param cause
     *          the exception raised while closing
     */
    private void logUnableToCloseFile(Exception cause) {
        if (!log.isBasic()) {
            return;
        }
        String filename = "<unknown>";
        if (data.filenames != null && data.filenr > 0 && data.filenr <= data.filenames.length) {
            filename = data.filenames[data.filenr - 1];
        }
        log.logBasic(BaseMessages.getString(PKG, "XMLInputStream.Log.UnableToCloseFile", filename), cause);
    }

    /**
     * Reads all rows from the previous step and stores the value of the filename field of each row in
     * <code>data.filenames</code>. The name of that field is the (variable substituted) filename setting of the
     * step.
     *
     * @throws KettleException
     *           when no rows arrive at all (there is no input row layout to look the field up in) or when the
     *           filename field is not part of the incoming rows
     */
    private void getFilenamesFromPreviousSteps() throws KettleException {
        List<String> filenames = new ArrayList<String>();

        Object[] row = getRow();

        // Without any incoming row there is no input row layout; report that the same way processRow does
        // instead of failing with a NullPointerException below.
        if (getInputRowMeta() == null) {
            throw new KettleException(BaseMessages.getString(PKG, "XMLInputStream.NoIncomingRowsFound"));
        }

        // Get the filename field index...
        //
        String filenameField = environmentSubstitute(meta.getFilename());
        int index = getInputRowMeta().indexOfValue(filenameField);
        if (index < 0) {
            throw new KettleException(
                    BaseMessages.getString(PKG, "XMLInputStream.FilenameFieldNotFound", filenameField));
        }

        while (row != null) {

            String filename = getInputRowMeta().getString(row, index);
            filenames.add(filename); // add it to the list...

            row = getRow(); // Grab another row...
        }

        data.filenames = filenames.toArray(new String[filenames.size()]);

        logDetailed(BaseMessages.getString(PKG, "XMLInputStream.Log.ReadingFromNrFiles",
                Integer.toString(data.filenames.length)));
    }

    /**
     * Completes a row with the bookkeeping columns (file name, row number, element ID/level, parent ID, paths)
     * and sends it to the next step(s). Each column is only filled when its position is configured (not -1).
     * The row counter is incremented for every call, also for rows that are skipped.
     *
     * @param r
     *          the row to complete and send; it is modified in place
     * @throws KettleStepException
     *           when the row cannot be passed on
     * @throws KettleValueException
     *           when the row cannot be converted to a string for row level logging
     */
    private void putRowOut(Object[] r) throws KettleStepException, KettleValueException {

        data.rowNumber++;
        // data.filenames is null when the XML comes from an input field; the column stays empty in that case.
        if (data.pos_xml_filename != -1 && data.filenames != null && data.filenr > 0) {
            r[data.pos_xml_filename] = data.filenames[(data.filenr - 1)];
        }
        if (data.pos_xml_row_number != -1) {
            r[data.pos_xml_row_number] = Long.valueOf(data.rowNumber);
        }
        if (data.pos_xml_element_id != -1) {
            r[data.pos_xml_element_id] = data.elementLevelID[data.elementLevel];
        }
        if (data.pos_xml_element_level != -1) {
            r[data.pos_xml_element_level] = Long.valueOf(data.elementLevel);
        }
        if (data.pos_xml_parent_element_id != -1) {
            r[data.pos_xml_parent_element_id] = data.elementParentID[data.elementLevel];
        }
        if (data.pos_xml_path != -1) {
            r[data.pos_xml_path] = data.elementPath[data.elementLevel];
        }
        // level 0 has no parent, so there is no parent path to report
        if (data.pos_xml_parent_path != -1 && data.elementLevel > 0) {
            r[data.pos_xml_parent_path] = data.elementPath[data.elementLevel - 1];
        }

        // We could think of adding an option to filter Start_end Document / Elements, RegEx?
        // We could think of adding columns identifying Element-Blocks

        // Skip rows? (not exact science since some attributes could be mixed within the last row)
        if (data.nrRowsToSkip == 0 || data.rowNumber > data.nrRowsToSkip) {
            if (log.isRowLevel()) {
                logRowlevel("Read row: " + data.outputRowMeta.getString(r));
            }
            putRow(data.outputRowMeta, r);
        }
    }

    /**
     * Consumes XML events until one of them yields a row for the caller, the reader has no more events, or the
     * step is stopped. Every consumed event is counted as an input line.
     *
     * @return the row produced by the last processed event, or <code>null</code> if none was produced
     * @throws KettleException
     *           when an event cannot be read or processed
     */
    private Object[] getRowFromXML() throws KettleException {
        Object[] producedRow = null;

        for (;;) {
            // stop when there is nothing left, a significant row is available, or the step was stopped
            if (!data.xmlEventReader.hasNext() || producedRow != null || isStopped()) {
                break;
            }

            producedRow = processEvent();

            // log all events (but no attributes sent by the EventReader)
            incrementLinesInput();
            final boolean feedbackDue = checkFeedback(getLinesInput());
            if (feedbackDue && isBasic()) {
                final String lineNumber = Long.toString(getLinesInput());
                logBasic(BaseMessages.getString(PKG, "XMLInputStream.Log.LineNumber", lineNumber));
            }
        }

        return producedRow;
    }

    /**
     * Reads the next event from the XML event reader and turns it into an output row.
     * <p>
     * The event type and location columns are filled for every event. Start elements update the nesting
     * bookkeeping (level, IDs, name, path) and may send extra rows for namespaces and attributes. End elements are
     * sent directly from here. Whitespace, comments, processing instructions, entity references, empty character
     * data and unhandled event types produce no row.
     *
     * @return the row the caller has to send, or <code>null</code> when the event produced nothing to send (or was
     *         already sent here)
     * @throws KettleException
     *           when the event cannot be read or the elements are nested deeper than PARENT_ID_ALLOCATE_SIZE allows
     */
    private Object[] processEvent() throws KettleException {

        Object[] outputRowData = RowDataUtil.allocateRowData(data.outputRowMeta.size());
        XMLEvent e;
        try {
            e = data.xmlEventReader.nextEvent();
        } catch (XMLStreamException ex) {
            throw new KettleException(ex);
        }

        int eventType = e.getEventType();
        if (data.pos_xml_data_type_numeric != -1) {
            outputRowData[data.pos_xml_data_type_numeric] = Long.valueOf(eventType);
        }
        if (data.pos_xml_data_type_description != -1) {
            // valid description indexes are 1 .. length - 1; anything else is reported as unknown
            if (eventType <= 0 || eventType >= eventDescription.length) {
                // unknown eventType
                outputRowData[data.pos_xml_data_type_description] = eventDescription[0] + "(" + eventType + ")";
            } else {
                outputRowData[data.pos_xml_data_type_description] = eventDescription[eventType];
            }
        }
        if (data.pos_xml_location_line != -1) {
            outputRowData[data.pos_xml_location_line] = Long.valueOf(e.getLocation().getLineNumber());
        }
        if (data.pos_xml_location_column != -1) {
            outputRowData[data.pos_xml_location_column] = Long.valueOf(e.getLocation().getColumnNumber());
        }

        switch (eventType) {

        case XMLStreamConstants.START_ELEMENT:
            data.elementLevel++;
            if (data.elementLevel > PARENT_ID_ALLOCATE_SIZE - 1) {
                throw new KettleException(BaseMessages.getString(PKG, "XMLInputStream.Log.TooManyNestedElements",
                        PARENT_ID_ALLOCATE_SIZE));
            }
            // the parent of the first element seen at this level (since it was last cleared) is the most
            // recently started element
            if (data.elementParentID[data.elementLevel] == null) {
                data.elementParentID[data.elementLevel] = data.elementID;
            }
            data.elementID++;
            data.elementLevelID[data.elementLevel] = data.elementID;

            String xml_data_name;
            if (meta.isEnableNamespaces()) {
                // prefix:localPart, or just localPart when there is no prefix
                xml_data_name = getName(e.asStartElement().getName().getPrefix(),
                        e.asStartElement().getName().getLocalPart());
            } else {
                xml_data_name = e.asStartElement().getName().getLocalPart();
            }
            if (data.pos_xml_data_name >= 0) {
                outputRowData[data.pos_xml_data_name] = xml_data_name;
            }
            // store the name
            data.elementName[data.elementLevel] = xml_data_name;
            // store simple path
            data.elementPath[data.elementLevel] = data.elementPath[data.elementLevel - 1] + "/" + xml_data_name;

            // write Namespaces out
            if (meta.isEnableNamespaces()) {
                outputRowData = parseNamespaces(outputRowData, e);
            }

            // write Attributes out
            outputRowData = parseAttributes(outputRowData, e);

            break;

        case XMLStreamConstants.END_ELEMENT:
            parseEndElement(outputRowData, e.asEndElement());
            putRowOut(outputRowData);
            // Forget the parent ID stored for the children of the closing element, so that the children of the
            // next element get their own parent ID. An element at the deepest allowed level has no child slot.
            if (data.elementLevel + 1 < data.elementParentID.length) {
                data.elementParentID[data.elementLevel + 1] = null;
            }
            data.elementLevel--;
            outputRowData = null; // continue
            break;

        case XMLStreamConstants.SPACE:
            outputRowData = null; // ignore & continue
            break;

        case XMLStreamConstants.CHARACTERS:
        case XMLStreamConstants.CDATA:
            if (data.pos_xml_data_name >= 0) {
                outputRowData[data.pos_xml_data_name] = data.elementName[data.elementLevel];
            }
            String xml_data_value = e.asCharacters().getData();
            if (data.pos_xml_data_value >= 0) {
                if (meta.isEnableTrim()) {
                    // optional trim is also eliminating white spaces, tab, cr, lf
                    xml_data_value = Const.trim(xml_data_value);
                }
                outputRowData[data.pos_xml_data_value] = xml_data_value;
            }

            // character data is only worth a row when there is a value column and the value is not empty
            if (data.pos_xml_data_value < 0 || Utils.isEmpty((String) outputRowData[data.pos_xml_data_value])) {
                outputRowData = null; // ignore & continue
            }
            break;

        case XMLStreamConstants.PROCESSING_INSTRUCTION:
            outputRowData = null; // ignore & continue
            // TODO test if possible
            break;

        case XMLStreamConstants.COMMENT:
            outputRowData = null; // ignore & continue
            // TODO test if possible
            break;

        case XMLStreamConstants.ENTITY_REFERENCE:
            // should be resolved by default
            outputRowData = null; // ignore & continue
            break;

        case XMLStreamConstants.START_DOCUMENT:
            // just get this information out
            break;

        case XMLStreamConstants.END_DOCUMENT:
            // just get this information out
            break;

        default:
            logBasic("Event:" + eventType);
            outputRowData = null; // ignore & continue
        }

        return outputRowData;
    }

    /**
     * Writes the name of the closing element into the data name column of the row, if that column is configured.
     *
     * @param outputRowData
     *          the row to fill
     * @param el
     *          the end element event
     */
    private void parseEndElement(Object[] outputRowData, EndElement el) {
        if (data.pos_xml_data_name < 0) {
            // no data name column requested
            return;
        }
        outputRowData[data.pos_xml_data_name] = getEndElementName(el, meta.isEnableNamespaces());
    }

    /**
     * Determines the name to report for an end element.
     *
     * @param el
     *          an EndElement event
     * @param enabledNamespaces
     *          <code>true</code> to include the namespace prefix (when present), <code>false</code> to return the
     *          local part only
     * @return the local part, optionally preceded by <code>prefix:</code>
     */
    private String getEndElementName(EndElement el, boolean enabledNamespaces) {
        if (enabledNamespaces) {
            return getName(el.getName().getPrefix(), el.getName().getLocalPart());
        }
        return el.getName().getLocalPart();
    }

    /**
     * Namespaces: put an extra row out for each namespace declared on a start element.
     * <p>
     * When the element declares namespaces, a copy of the element row is sent first; then one row per namespace
     * follows, typed as NAMESPACE, with the prefix in the data name column and the URI in the data value column
     * (each only when that column is configured). The last namespace row is not sent here but returned.
     *
     * @param outputRowData
     *          the row describing the start element; its event type columns are changed to NAMESPACE when the
     *          element declares namespaces
     * @param e
     *          the start element event
     * @return the last namespace row, or <code>outputRowData</code> if the element declares no namespaces
     */
    @SuppressWarnings("unchecked")
    private Object[] parseNamespaces(Object[] outputRowData, XMLEvent e)
            throws KettleValueException, KettleStepException {
        Iterator<Namespace> iter = e.asStartElement().getNamespaces();
        if (iter.hasNext()) {
            Object[] outputRowDataNamespace = data.outputRowMeta.cloneRow(outputRowData);
            putRowOut(outputRowDataNamespace); // first put the element name info out
            // change data_type to NAMESPACE
            if (data.pos_xml_data_type_numeric != -1) {
                outputRowData[data.pos_xml_data_type_numeric] = Long.valueOf(XMLStreamConstants.NAMESPACE);
            }
            if (data.pos_xml_data_type_description != -1) {
                outputRowData[data.pos_xml_data_type_description] = eventDescription[XMLStreamConstants.NAMESPACE];
            }
        }
        while (iter.hasNext()) {
            Object[] outputRowDataNamespace = data.outputRowMeta.cloneRow(outputRowData);
            Namespace n = iter.next();
            // both columns are optional (position -1 when not configured), same as for attributes
            if (data.pos_xml_data_name != -1) {
                outputRowDataNamespace[data.pos_xml_data_name] = n.getPrefix();
            }
            if (data.pos_xml_data_value != -1) {
                outputRowDataNamespace[data.pos_xml_data_value] = n.getNamespaceURI();
            }
            if (iter.hasNext()) {
                // send out the Namespace row
                putRowOut(outputRowDataNamespace);
            } else {
                // last row: this will be sent out by the outer loop
                outputRowData = outputRowDataNamespace;
            }
        }

        return outputRowData;
    }

    /**
     * Attributes: put an extra row out for each attribute of a start element.
     * <p>
     * When the element has attributes, a copy of the incoming row (element or namespace info) is sent first; then
     * one row per attribute follows, typed as ATTRIBUTE. The last attribute row is not sent here but returned.
     *
     * @param outputRowData
     *          the row built so far for the start element; its event type columns are changed to ATTRIBUTE when
     *          the element has attributes
     * @param e
     *          the start element event
     * @return the last attribute row, or <code>outputRowData</code> if the element has no attributes
     */
    @SuppressWarnings("unchecked")
    private Object[] parseAttributes(Object[] outputRowData, XMLEvent e)
            throws KettleValueException, KettleStepException {
        Iterator<Attribute> iter = e.asStartElement().getAttributes();
        if (iter.hasNext()) {
            Object[] outputRowDataAttribute = data.outputRowMeta.cloneRow(outputRowData);
            putRowOut(outputRowDataAttribute); // first put the element name (or namespace) info out
            // change data_type to ATTRIBUTE
            if (data.pos_xml_data_type_numeric != -1) {
                outputRowData[data.pos_xml_data_type_numeric] = Long.valueOf(XMLStreamConstants.ATTRIBUTE);
            }
            if (data.pos_xml_data_type_description != -1) {
                outputRowData[data.pos_xml_data_type_description] = eventDescription[XMLStreamConstants.ATTRIBUTE];
            }
        }
        while (iter.hasNext()) {
            // every attribute row starts as a copy of the (re-typed) element row
            Object[] outputRowDataAttribute = data.outputRowMeta.cloneRow(outputRowData);
            Attribute a = iter.next();
            parseAttribute(outputRowDataAttribute, a, meta.isEnableNamespaces());
            if (iter.hasNext()) {
                // send out the Attribute row
                putRowOut(outputRowDataAttribute);
            } else {
                // last row: this will be sent out by the outer loop
                outputRowData = outputRowDataAttribute;
            }
        }

        return outputRowData;
    }

    /**
     * Copies name and value of one attribute into the data name / data value columns of the row. A column that is
     * not configured (position -1) is left alone.
     *
     * @param outputRowDataAttribute
     *          the row to fill
     * @param a
     *          the attribute to read
     * @param enabledNamespaces
     *          passed on to getAttributeName to decide whether the prefix is part of the name
     */
    private void parseAttribute(Object[] outputRowDataAttribute, Attribute a, boolean enabledNamespaces) {
        final boolean nameColumnPresent = data.pos_xml_data_name != -1;
        if (nameColumnPresent) {
            outputRowDataAttribute[data.pos_xml_data_name] = getAttributeName(a, enabledNamespaces);
        }

        final boolean valueColumnPresent = data.pos_xml_data_value != -1;
        if (valueColumnPresent) {
            outputRowDataAttribute[data.pos_xml_data_value] = a.getValue();
        }
    }

    /**
     * Determines the name to report for an attribute.
     *
     * @param a
     *          an attribute event
     * @param enabledNamespaces
     *          <code>true</code> to include the namespace prefix (when present), <code>false</code> to return the
     *          local part only
     * @return the local part, optionally preceded by <code>prefix:</code>
     */
    private String getAttributeName(Attribute a, boolean enabledNamespaces) {
        if (enabledNamespaces) {
            return getName(a.getName().getPrefix(), a.getName().getLocalPart());
        }
        return a.getName().getLocalPart();
    }

    /**
     * Joins prefix and local part into <code>prefix:localPart</code>. When the prefix is empty the local part is
     * returned on its own.
     *
     * @param prefix
     *          the namespace prefix part of the qualified name
     * @param localPart
     *          the local part of the qualified name
     * @return the qualified name
     */
    private String getName(String prefix, String localPart) {
        if (Utils.isEmpty(prefix)) {
            return localPart;
        }
        return prefix + ":" + localPart;
    }

    /**
     * Resets the per-document bookkeeping: row number, nesting level, element ID and the per-level arrays for
     * IDs, parent IDs, names and paths. Called whenever a new file or XML document is started.
     */
    private void resetElementCounters() {
        data.rowNumber = Long.valueOf(0L);
        data.elementLevel = 0;
        data.elementID = Long.valueOf(0L); // init value, could be parameterized later on
        data.elementLevelID = new Long[PARENT_ID_ALLOCATE_SIZE];
        data.elementLevelID[0] = data.elementID; // initial id for level 0
        data.elementParentID = new Long[PARENT_ID_ALLOCATE_SIZE];
        data.elementName = new String[PARENT_ID_ALLOCATE_SIZE];
        data.elementPath = new String[PARENT_ID_ALLOCATE_SIZE];
        data.elementPath[0] = ""; // initial empty
    }

    /**
     * Prepares the step: creates the StAX factory, resolves the file name (only when the step has no previous
     * steps and does not read the XML from an input field), the skip/limit/encoding settings, the output row
     * layout and the position of every optional output field.
     *
     * @return <code>false</code> when the base initialization fails or a required file name is missing,
     *         <code>true</code> otherwise
     */
    @Override
    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (XMLInputStreamMeta) smi;
        data = (XMLInputStreamData) sdi;

        if (super.init(smi, sdi)) {
            data.staxInstance = XMLInputFactory.newInstance(); // could select the parser later on
            // The step parses arbitrary files / field content: do not let a document pull in external entities
            // (XXE). DTD support itself is left untouched so internal entities keep working.
            data.staxInstance.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            data.filenr = 0;
            if (getTransMeta().findNrPrevSteps(getStepMeta()) == 0 && !meta.sourceFromInput) {
                String filename = environmentSubstitute(meta.getFilename());
                if (Utils.isEmpty(filename)) {
                    logError(BaseMessages.getString(PKG, "XMLInputStream.MissingFilename.Message"));
                    return false;
                }

                data.filenames = new String[] { filename, };
            } else {
                // resolved later from the incoming rows (file mode) or not used at all (input-field mode)
                data.filenames = null;
            }

            data.nrRowsToSkip = Const.toLong(this.environmentSubstitute(meta.getNrRowsToSkip()), 0);
            data.rowLimit = Const.toLong(this.environmentSubstitute(meta.getRowLimit()), 0);
            data.encoding = this.environmentSubstitute(meta.getEncoding());

            data.outputRowMeta = new RowMeta();
            meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);

            // get and save field positions
            data.pos_xml_filename = data.outputRowMeta.indexOfValue(meta.getFilenameField());
            data.pos_xml_row_number = data.outputRowMeta.indexOfValue(meta.getRowNumberField());
            data.pos_xml_data_type_numeric = data.outputRowMeta.indexOfValue(meta.getXmlDataTypeNumericField());
            data.pos_xml_data_type_description = data.outputRowMeta
                    .indexOfValue(meta.getXmlDataTypeDescriptionField());
            data.pos_xml_location_line = data.outputRowMeta.indexOfValue(meta.getXmlLocationLineField());
            data.pos_xml_location_column = data.outputRowMeta.indexOfValue(meta.getXmlLocationColumnField());
            data.pos_xml_element_id = data.outputRowMeta.indexOfValue(meta.getXmlElementIDField());
            data.pos_xml_parent_element_id = data.outputRowMeta.indexOfValue(meta.getXmlParentElementIDField());
            data.pos_xml_element_level = data.outputRowMeta.indexOfValue(meta.getXmlElementLevelField());
            data.pos_xml_path = data.outputRowMeta.indexOfValue(meta.getXmlPathField());
            data.pos_xml_parent_path = data.outputRowMeta.indexOfValue(meta.getXmlParentPathField());
            data.pos_xml_data_name = data.outputRowMeta.indexOfValue(meta.getXmlDataNameField());
            data.pos_xml_data_value = data.outputRowMeta.indexOfValue(meta.getXmlDataValueField());
            return true;
        }
        return false;
    }

    /**
     * Releases what the step holds: the open reader/stream/file handles and the StAX factory reference, then
     * lets the base class finish the disposal.
     */
    @Override
    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (XMLInputStreamMeta) smi;
        data = (XMLInputStreamData) sdi;

        closeFile(); // free resources
        data.staxInstance = null;

        super.dispose(smi, sdi);
    }

}