// Java tutorial
/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.trans.steps.xmlinputstream; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.Attribute; import javax.xml.stream.events.EndElement; import javax.xml.stream.events.Namespace; import javax.xml.stream.events.XMLEvent; import org.apache.commons.vfs2.FileSystemException; import org.pentaho.di.core.Const; import org.pentaho.di.core.ResultFile; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.exception.KettleStepException; import org.pentaho.di.core.exception.KettleValueException; import org.pentaho.di.core.row.RowDataUtil; import org.pentaho.di.core.row.RowMeta; import org.pentaho.di.core.util.Utils; import org.pentaho.di.core.vfs.KettleVFS; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.di.trans.Trans; import org.pentaho.di.trans.TransMeta; import org.pentaho.di.trans.step.BaseStep; import 
org.pentaho.di.trans.step.StepDataInterface; import org.pentaho.di.trans.step.StepInterface; import org.pentaho.di.trans.step.StepMeta; import org.pentaho.di.trans.step.StepMetaInterface; /** * Use a StAX parser to read XML in a flexible and fast way. * * @author Jens Bleuel * @since 2011-01-13 */ // TODO black box testing public class XMLInputStream extends BaseStep implements StepInterface { private static Class<?> PKG = XMLInputStream.class; // for i18n purposes, needed by Translator2!! private static int PARENT_ID_ALLOCATE_SIZE = 1000; // max. number of nested elements, we may let the user configure // this private XMLInputStreamMeta meta; private XMLInputStreamData data; private int inputFieldIndex; static final String[] eventDescription = { "UNKNOWN", "START_ELEMENT", "END_ELEMENT", "PROCESSING_INSTRUCTION", "CHARACTERS", "COMMENT", "SPACE", "START_DOCUMENT", "END_DOCUMENT", "ENTITY_REFERENCE", "ATTRIBUTE", "DTD", "CDATA", "NAMESPACE", "NOTATION_DECLARATION", "ENTITY_DECLARATION" }; public XMLInputStream(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, Trans trans) { super(stepMeta, stepDataInterface, copyNr, transMeta, trans); } @Override public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException { if (first && !meta.sourceFromInput) { first = false; if (data.filenames == null) { getFilenamesFromPreviousSteps(); } openNextFile(); resetElementCounters(); } Object[] outputRowData; if (meta.sourceFromInput) { Object[] row = null; if (first) { first = false; row = getRow(); // get input field index if (getInputRowMeta() == null) { throw new KettleException(BaseMessages.getString(PKG, "XMLInputStream.NoIncomingRowsFound")); } inputFieldIndex = getInputRowMeta().indexOfValue(meta.sourceFieldName); if (inputFieldIndex < 0) { throw new KettleException(BaseMessages.getString(PKG, "XMLInputStream.FilenameFieldNotFound", meta.sourceFieldName)); } } if (data.xmlEventReader == null) { if (row == 
null) { row = getRow(); } if (row == null) { setOutputDone(); // signal end to receiver(s) return false; // This is the end of this step. } String xml = getInputRowMeta().getString(row, inputFieldIndex); try { data.xmlEventReader = data.staxInstance.createXMLEventReader(new StringReader(xml)); } catch (XMLStreamException e) { throw new KettleException(e); } resetElementCounters(); } outputRowData = getRowFromXML(); if (outputRowData == null) { data.xmlEventReader = null; return true; } } else { outputRowData = getRowFromXML(); if (outputRowData == null) { if (openNextFile()) { resetElementCounters(); return true; } else { setOutputDone(); // signal end to receiver(s) return false; // This is the end of this step. } } } putRowOut(outputRowData); // limit has been reached: stop now. (not exact science since some attributes could be mixed within the last row) if (data.rowLimit > 0 && data.rowNumber >= data.rowLimit) { setOutputDone(); return false; } return true; } private boolean openNextFile() throws KettleException { try { closeFile(); if (data.filenr >= data.filenames.length) { return false; } data.fileObject = KettleVFS.getFileObject(data.filenames[data.filenr], getTransMeta()); data.inputStream = KettleVFS.getInputStream(data.fileObject); data.xmlEventReader = data.staxInstance.createXMLEventReader(data.inputStream, data.encoding); } catch (IOException e) { throw new KettleException(e); } catch (XMLStreamException e) { throw new KettleException(e); } data.filenr++; if (meta.isAddResultFile()) { // Add this to the result file names... 
ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_GENERAL, data.fileObject, getTransMeta().getName(), getStepname()); resultFile.setComment(BaseMessages.getString(PKG, "XMLInputStream.Log.ResultFileWasRead")); addResultFile(resultFile); } return true; } private void closeFile() { if (data.xmlEventReader != null) { try { data.xmlEventReader.close(); } catch (XMLStreamException e) { if (log.isBasic()) { log.logBasic(BaseMessages.getString(PKG, "XMLInputStream.Log.UnableToCloseFile", data.filenames[(data.filenr - 1)]), e); } } } if (data.inputStream != null) { try { data.inputStream.close(); } catch (IOException e) { if (log.isBasic()) { log.logBasic(BaseMessages.getString(PKG, "XMLInputStream.Log.UnableToCloseFile", data.filenames[(data.filenr - 1)]), e); } } } if (data.fileObject != null) { try { data.fileObject.close(); } catch (FileSystemException e) { if (log.isBasic()) { log.logBasic(BaseMessages.getString(PKG, "XMLInputStream.Log.UnableToCloseFile", data.filenames[(data.filenr - 1)]), e); } } } } private void getFilenamesFromPreviousSteps() throws KettleException { List<String> filenames = new ArrayList<String>(); int index = -1; Object[] row = getRow(); // Get the filename field index... // String filenameField = environmentSubstitute(meta.getFilename()); index = getInputRowMeta().indexOfValue(filenameField); if (index < 0) { throw new KettleException( BaseMessages.getString(PKG, "XMLInputStream.FilenameFieldNotFound", filenameField)); } while (row != null) { String filename = getInputRowMeta().getString(row, index); filenames.add(filename); // add it to the list... row = getRow(); // Grab another row... 
} data.filenames = filenames.toArray(new String[filenames.size()]); logDetailed(BaseMessages.getString(PKG, "XMLInputStream.Log.ReadingFromNrFiles", Integer.toString(data.filenames.length))); } // sends the normal row and attributes private void putRowOut(Object[] r) throws KettleStepException, KettleValueException { data.rowNumber++; if (data.pos_xml_filename != -1) { r[data.pos_xml_filename] = new String(data.filenames[(data.filenr - 1)]); } if (data.pos_xml_row_number != -1) { r[data.pos_xml_row_number] = new Long(data.rowNumber); } if (data.pos_xml_element_id != -1) { r[data.pos_xml_element_id] = data.elementLevelID[data.elementLevel]; } if (data.pos_xml_element_level != -1) { r[data.pos_xml_element_level] = new Long(data.elementLevel); } if (data.pos_xml_parent_element_id != -1) { r[data.pos_xml_parent_element_id] = data.elementParentID[data.elementLevel]; } if (data.pos_xml_path != -1) { r[data.pos_xml_path] = data.elementPath[data.elementLevel]; } if (data.pos_xml_parent_path != -1 && data.elementLevel > 0) { r[data.pos_xml_parent_path] = data.elementPath[data.elementLevel - 1]; } // We could think of adding an option to filter Start_end Document / Elements, RegEx? // We could think of adding columns identifying Element-Blocks // Skip rows? 
(not exact science since some attributes could be mixed within the last row) if (data.nrRowsToSkip == 0 || data.rowNumber > data.nrRowsToSkip) { if (log.isRowLevel()) { logRowlevel("Read row: " + data.outputRowMeta.getString(r)); } putRow(data.outputRowMeta, r); } } private Object[] getRowFromXML() throws KettleException { Object[] outputRowData = null; // loop until significant data is there and more data is there while (data.xmlEventReader.hasNext() && outputRowData == null && !isStopped()) { outputRowData = processEvent(); // log all events (but no attributes sent by the EventReader) incrementLinesInput(); if (checkFeedback(getLinesInput()) && isBasic()) { logBasic(BaseMessages.getString(PKG, "XMLInputStream.Log.LineNumber", Long.toString(getLinesInput()))); } } return outputRowData; } private Object[] processEvent() throws KettleException { Object[] outputRowData = RowDataUtil.allocateRowData(data.outputRowMeta.size()); XMLEvent e = null; try { e = data.xmlEventReader.nextEvent(); } catch (XMLStreamException ex) { throw new KettleException(ex); } int eventType = e.getEventType(); if (data.pos_xml_data_type_numeric != -1) { outputRowData[data.pos_xml_data_type_numeric] = new Long(eventType); } if (data.pos_xml_data_type_description != -1) { if (eventType == 0 || eventType > eventDescription.length) { // unknown eventType outputRowData[data.pos_xml_data_type_description] = eventDescription[0] + "(" + eventType + ")"; } else { outputRowData[data.pos_xml_data_type_description] = eventDescription[eventType]; } } if (data.pos_xml_location_line != -1) { outputRowData[data.pos_xml_location_line] = new Long(e.getLocation().getLineNumber()); } if (data.pos_xml_location_column != -1) { outputRowData[data.pos_xml_location_column] = new Long(e.getLocation().getColumnNumber()); } switch (eventType) { case XMLStreamConstants.START_ELEMENT: data.elementLevel++; if (data.elementLevel > PARENT_ID_ALLOCATE_SIZE - 1) { throw new KettleException(BaseMessages.getString(PKG, 
"XMLInputStream.Log.TooManyNestedElements", PARENT_ID_ALLOCATE_SIZE)); } if (data.elementParentID[data.elementLevel] == null) { data.elementParentID[data.elementLevel] = data.elementID; } data.elementID++; data.elementLevelID[data.elementLevel] = data.elementID; String xml_data_name; if (meta.isEnableNamespaces()) { String prefix = e.asStartElement().getName().getPrefix(); if (Utils.isEmpty(prefix)) { xml_data_name = e.asStartElement().getName().getLocalPart(); } else { // add namespace prefix: xml_data_name = prefix + ":" + e.asStartElement().getName().getLocalPart(); } } else { xml_data_name = e.asStartElement().getName().getLocalPart(); } if (data.pos_xml_data_name >= 0) { outputRowData[data.pos_xml_data_name] = xml_data_name; } // store the name data.elementName[data.elementLevel] = xml_data_name; // store simple path data.elementPath[data.elementLevel] = data.elementPath[data.elementLevel - 1] + "/" + xml_data_name; // write Namespaces out if (meta.isEnableNamespaces()) { outputRowData = parseNamespaces(outputRowData, e); } // write Attributes out outputRowData = parseAttributes(outputRowData, e); break; case XMLStreamConstants.END_ELEMENT: parseEndElement(outputRowData, e.asEndElement()); putRowOut(outputRowData); data.elementParentID[data.elementLevel + 1] = null; data.elementLevel--; outputRowData = null; // continue break; case XMLStreamConstants.SPACE: outputRowData = null; // ignore & continue break; case XMLStreamConstants.CHARACTERS: case XMLStreamConstants.CDATA: if (data.pos_xml_data_name >= 0) { outputRowData[data.pos_xml_data_name] = data.elementName[data.elementLevel]; } String xml_data_value = e.asCharacters().getData(); if (data.pos_xml_data_value >= 0) { if (meta.isEnableTrim()) { // optional trim is also eliminating white spaces, tab, cr, lf xml_data_value = Const.trim(xml_data_value); } outputRowData[data.pos_xml_data_value] = xml_data_value; } if (data.pos_xml_data_value < 0 || Utils.isEmpty((String) outputRowData[data.pos_xml_data_value])) 
{ outputRowData = null; // ignore & continue } break; case XMLStreamConstants.PROCESSING_INSTRUCTION: outputRowData = null; // ignore & continue // TODO test if possible break; case XMLStreamConstants.COMMENT: outputRowData = null; // ignore & continue // TODO test if possible break; case XMLStreamConstants.ENTITY_REFERENCE: // should be resolved by default outputRowData = null; // ignore & continue break; case XMLStreamConstants.START_DOCUMENT: // just get this information out break; case XMLStreamConstants.END_DOCUMENT: // just get this information out break; default: logBasic("Event:" + eventType); outputRowData = null; // ignore & continue } return outputRowData; } private void parseEndElement(Object[] outputRowData, EndElement el) { if (data.pos_xml_data_name >= 0) { outputRowData[data.pos_xml_data_name] = getEndElementName(el, meta.isEnableNamespaces()); } } /** * Returns the qualified name of the end element * * @param el * an EndElement event * @param enabledNamespaces * indicates if namespaces should be added or not * @return the qualified name of the end element */ private String getEndElementName(EndElement el, boolean enabledNamespaces) { if (!enabledNamespaces) { return el.getName().getLocalPart(); } else { return getName(el.getName().getPrefix(), el.getName().getLocalPart()); } } // Namespaces: put an extra row out for each namespace @SuppressWarnings("unchecked") private Object[] parseNamespaces(Object[] outputRowData, XMLEvent e) throws KettleValueException, KettleStepException { Iterator<Namespace> iter = e.asStartElement().getNamespaces(); if (iter.hasNext()) { Object[] outputRowDataNamespace = data.outputRowMeta.cloneRow(outputRowData); putRowOut(outputRowDataNamespace); // first put the element name info out // change data_type to ATTRIBUTE if (data.pos_xml_data_type_numeric != -1) { outputRowData[data.pos_xml_data_type_numeric] = new Long(XMLStreamConstants.NAMESPACE); } if (data.pos_xml_data_type_description != -1) { 
outputRowData[data.pos_xml_data_type_description] = eventDescription[XMLStreamConstants.NAMESPACE]; } } while (iter.hasNext()) { Object[] outputRowDataNamespace = data.outputRowMeta.cloneRow(outputRowData); Namespace n = iter.next(); outputRowDataNamespace[data.pos_xml_data_name] = n.getPrefix(); outputRowDataNamespace[data.pos_xml_data_value] = n.getNamespaceURI(); if (iter.hasNext()) { // send out the Namespace row putRowOut(outputRowDataNamespace); } else { // last row: this will be sent out by the outer loop outputRowData = outputRowDataNamespace; } } return outputRowData; } // Attributes: put an extra row out for each attribute @SuppressWarnings("unchecked") private Object[] parseAttributes(Object[] outputRowData, XMLEvent e) throws KettleValueException, KettleStepException { Iterator<Attribute> iter = e.asStartElement().getAttributes(); if (iter.hasNext()) { Object[] outputRowDataAttribute = data.outputRowMeta.cloneRow(outputRowData); putRowOut(outputRowDataAttribute); // first put the element name (or namespace) info out // change data_type to ATTRIBUTE if (data.pos_xml_data_type_numeric != -1) { outputRowData[data.pos_xml_data_type_numeric] = new Long(XMLStreamConstants.ATTRIBUTE); } if (data.pos_xml_data_type_description != -1) { outputRowData[data.pos_xml_data_type_description] = eventDescription[XMLStreamConstants.ATTRIBUTE]; } } while (iter.hasNext()) { Object[] outputRowDataAttribute = data.outputRowMeta.cloneRow(outputRowData); Attribute a = iter.next(); parseAttribute(outputRowDataAttribute, a, meta.isEnableNamespaces()); if (iter.hasNext()) { // send out the Attribute row putRowOut(outputRowDataAttribute); } else { // last row: this will be sent out by the outer loop outputRowData = outputRowDataAttribute; } } return outputRowData; } private void parseAttribute(Object[] outputRowDataAttribute, Attribute a, boolean enabledNamespaces) { if (data.pos_xml_data_name != -1) { outputRowDataAttribute[data.pos_xml_data_name] = getAttributeName(a, 
enabledNamespaces); } if (data.pos_xml_data_value != -1) { outputRowDataAttribute[data.pos_xml_data_value] = a.getValue(); } } /** * Returns the qualified name of the attribute * * @param a * an attribute event * @param enabledNamespaces * indicates if namespaces should be added or not * @return the qualified name of the attribute */ private String getAttributeName(Attribute a, boolean enabledNamespaces) { if (!enabledNamespaces) { return a.getName().getLocalPart(); } else { return getName(a.getName().getPrefix(), a.getName().getLocalPart()); } } /** * Returns the qualified name in the format: <code>prefix:localPart</code> if the prefix is present otherwise just * <code>localPart</code> * * @param prefix * the namespace prefix part of the qualified name * @param localPart * the local part of the qualified name * @return the qualified name */ private String getName(String prefix, String localPart) { return (!Utils.isEmpty(prefix)) ? prefix + ":" + localPart : localPart; } private void resetElementCounters() { data.rowNumber = new Long(0); data.elementLevel = 0; data.elementID = new Long(0); // init value, could be parameterized later on data.elementLevelID = new Long[PARENT_ID_ALLOCATE_SIZE]; data.elementLevelID[0] = data.elementID; // inital id for level 0 data.elementParentID = new Long[PARENT_ID_ALLOCATE_SIZE]; data.elementName = new String[PARENT_ID_ALLOCATE_SIZE]; data.elementPath = new String[PARENT_ID_ALLOCATE_SIZE]; data.elementPath[0] = ""; // initial empty } @Override public boolean init(StepMetaInterface smi, StepDataInterface sdi) { meta = (XMLInputStreamMeta) smi; data = (XMLInputStreamData) sdi; if (super.init(smi, sdi)) { data.staxInstance = XMLInputFactory.newInstance(); // could select the parser later on data.filenr = 0; if (getTransMeta().findNrPrevSteps(getStepMeta()) == 0 && !meta.sourceFromInput) { String filename = environmentSubstitute(meta.getFilename()); if (Utils.isEmpty(filename)) { logError(BaseMessages.getString(PKG, 
"XMLInputStream.MissingFilename.Message")); return false; } data.filenames = new String[] { filename, }; } else { data.filenames = null; } data.nrRowsToSkip = Const.toLong(this.environmentSubstitute(meta.getNrRowsToSkip()), 0); data.rowLimit = Const.toLong(this.environmentSubstitute(meta.getRowLimit()), 0); data.encoding = this.environmentSubstitute(meta.getEncoding()); data.outputRowMeta = new RowMeta(); meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore); // get and save field positions data.pos_xml_filename = data.outputRowMeta.indexOfValue(meta.getFilenameField()); data.pos_xml_row_number = data.outputRowMeta.indexOfValue(meta.getRowNumberField()); data.pos_xml_data_type_numeric = data.outputRowMeta.indexOfValue(meta.getXmlDataTypeNumericField()); data.pos_xml_data_type_description = data.outputRowMeta .indexOfValue(meta.getXmlDataTypeDescriptionField()); data.pos_xml_location_line = data.outputRowMeta.indexOfValue(meta.getXmlLocationLineField()); data.pos_xml_location_column = data.outputRowMeta.indexOfValue(meta.getXmlLocationColumnField()); data.pos_xml_element_id = data.outputRowMeta.indexOfValue(meta.getXmlElementIDField()); data.pos_xml_parent_element_id = data.outputRowMeta.indexOfValue(meta.getXmlParentElementIDField()); data.pos_xml_element_level = data.outputRowMeta.indexOfValue(meta.getXmlElementLevelField()); data.pos_xml_path = data.outputRowMeta.indexOfValue(meta.getXmlPathField()); data.pos_xml_parent_path = data.outputRowMeta.indexOfValue(meta.getXmlParentPathField()); data.pos_xml_data_name = data.outputRowMeta.indexOfValue(meta.getXmlDataNameField()); data.pos_xml_data_value = data.outputRowMeta.indexOfValue(meta.getXmlDataValueField()); return true; } return false; } @Override public void dispose(StepMetaInterface smi, StepDataInterface sdi) { meta = (XMLInputStreamMeta) smi; data = (XMLInputStreamData) sdi; // free resources closeFile(); data.staxInstance = null; super.dispose(smi, sdi); } }