org.pentaho.di.trans.steps.getxmldata.GetXMLData.java Source code

Java tutorial

Introduction

Here is the source code for org.pentaho.di.trans.steps.getxmldata.GetXMLData.java

Source

/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.getxmldata;

import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.http.client.methods.HttpGet;
import org.dom4j.Element;
import org.dom4j.ElementHandler;
import org.dom4j.ElementPath;
import org.dom4j.Namespace;
import org.dom4j.Node;
import org.dom4j.XPath;
import org.dom4j.io.SAXReader;
import org.dom4j.tree.AbstractNode;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.HttpClientManager;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.fileinput.FileInputList;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaFactory;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.core.xml.XMLParserFactoryProducer;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Read XML files, parse them and convert them to rows and writes these to one or more output streams.
 * 
 * @author Samatar,Brahim
 * @since 20-06-2007
 */
public class GetXMLData extends BaseStep implements StepInterface {
    // Class handed to BaseMessages.getString(PKG, ...) throughout this step to look up i18n messages.
    private static Class<?> PKG = GetXMLDataMeta.class; // for i18n purposes, needed by Translator2!!

    // Step settings (meta) and per-execution working state (data) used by every method of this step.
    // NOTE(review): neither is assigned in the code visible here - presumably done in init(); confirm.
    private GetXMLDataMeta meta;
    private GetXMLDataData data;
    // Re-allocated as an empty row each time setDocument() is called.
    private Object[] prevRow = null; // A pre-allocated spot for the previous row

    /**
     * Creates the step; all arguments are handed unchanged to the {@link BaseStep} constructor.
     *
     * @param stepMeta          passed through to the superclass
     * @param stepDataInterface passed through to the superclass
     * @param copyNr            passed through to the superclass
     * @param transMeta         passed through to the superclass
     * @param trans             passed through to the superclass
     */
    public GetXMLData(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }

    /**
     * Parses an XML source and stores the resulting document in {@code data.document}.
     * <p>
     * The source is chosen as follows:
     * <ul>
     * <li>{@code IsInXMLField}: {@code StringXML} holds the XML text itself;</li>
     * <li>{@code readurl} and {@code StringXML} starts with a VFS scheme: the stream is opened through
     * {@link KettleVFS};</li>
     * <li>{@code readurl} otherwise: {@code StringXML} is fetched with an HTTP GET (gzip responses are
     * decompressed, any other response body is parsed as received);</li>
     * <li>otherwise: {@code file} is read using the configured encoding (UTF-8 when none is set).</li>
     * </ul>
     * When {@code data.prunePath} is set, a handler is registered on the reader so that matching elements
     * are processed by {@link #processStreaming(Element)} and detached while the document is being read.
     *
     * @param StringXML    XML text or URL, depending on the flags (unused when reading {@code file})
     * @param file         file to read when neither {@code IsInXMLField} nor {@code readurl} is set
     * @param IsInXMLField {@code true} when {@code StringXML} contains the XML text
     * @param readurl      {@code true} when {@code StringXML} is a URL to read from
     * @return {@code false} when reading failed after streaming was stopped on purpose
     *         ({@code data.stopPruning}), {@code true} otherwise
     * @throws KettleException wrapping any other failure while reading or parsing
     */
    protected boolean setDocument(String StringXML, FileObject file, boolean IsInXMLField, boolean readurl)
            throws KettleException {

        this.prevRow = buildEmptyRow(); // pre-allocate previous row

        try {
            SAXReader reader = XMLParserFactoryProducer.getSAXReader(null);
            data.stopPruning = false;
            // Validate XML against specified schema?
            if (meta.isValidating()) {
                reader.setValidation(true);
                reader.setFeature("http://apache.org/xml/features/validation/schema", true);
            } else {
                // Ignore DTD declarations
                reader.setEntityResolver(new IgnoreDTDEntityResolver());
            }

            // Ignore comments?
            if (meta.isIgnoreComments()) {
                reader.setIgnoreComments(true);
            }

            if (data.prunePath != null) {
                // when pruning is on: reader.read() below will wait until all is processed in the handler
                if (log.isDetailed()) {
                    logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.Activated"));
                }
                if (data.PathValue.equals(data.prunePath)) {
                    // Edge case, but if true, there will only ever be one item in the list
                    data.an = new ArrayList<>(1); // pre-allocate array and sizes
                    data.an.add(null);
                }
                reader.addHandler(data.prunePath, new ElementHandler() {
                    @Override
                    public void onStart(ElementPath path) {
                        // do nothing here...
                    }

                    @Override
                    public void onEnd(ElementPath path) {
                        if (isStopped()) {
                            // When a large file is processed and the step is stopped, the reader would still
                            // read the whole thing. Detaching the document makes the parser fail (NPE or other
                            // errors depending on the parsing location); that failure is swallowed in the
                            // catch block of setDocument() because data.stopPruning is set.
                            if (log.isBasic()) {
                                logBasic(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.Stopped"));
                            }
                            data.stopPruning = true;
                            path.getCurrent().getDocument().detach(); // trick to stop reader
                            return;
                        }

                        // process a ROW element
                        if (log.isDebug()) {
                            logDebug(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.StartProcessing"));
                        }
                        Element row = path.getCurrent();
                        try {
                            // Pass over the row instead of just the document. If
                            // there's only one row, there's no need to
                            // go back to the whole document.
                            processStreaming(row);
                        } catch (Exception e) {
                            // catch the KettleException or others and forward to caller, e.g. when applyXPath() has a problem
                            throw new RuntimeException(e);
                        }
                        // prune the tree
                        row.detach();
                        if (log.isDebug()) {
                            logDebug(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.EndProcessing"));
                        }
                    }
                });
            }

            if (IsInXMLField) {
                // read string to parse
                data.document = reader.read(new StringReader(StringXML));
            } else if (readurl && KettleVFS.startsWithScheme(StringXML)) {
                // read a VFS url; make sure the stream is released even when parsing fails
                InputStream vfsStream = KettleVFS.getInputStream(StringXML);
                try {
                    data.document = reader.read(vfsStream);
                } finally {
                    BaseStep.closeQuietly(vfsStream);
                }
            } else if (readurl) {
                // read url as source
                HttpClient client = HttpClientManager.getInstance().createDefaultClient();
                HttpGet method = new HttpGet(StringXML);
                method.addHeader("Accept-Encoding", "gzip");
                HttpResponse response = client.execute(method);
                Header contentEncoding = response.getFirstHeader("Content-Encoding");
                HttpEntity responseEntity = response.getEntity();
                if (responseEntity != null) {
                    String encodingValue = contentEncoding == null ? null : contentEncoding.getValue();
                    InputStream body = responseEntity.getContent();
                    try {
                        if (encodingValue != null && encodingValue.contains("gzip")) {
                            body = new GZIPInputStream(body);
                        }
                        // Any non-gzip Content-Encoding (e.g. "identity") is parsed as received instead of
                        // being silently skipped, which used to leave a stale or null document behind.
                        data.document = reader.read(body);
                    } finally {
                        // Closes the gzip wrapper when one was created, otherwise the raw entity stream.
                        BaseStep.closeQuietly(body);
                    }
                }
            } else {
                // get encoding. By default UTF-8
                String encoding = "UTF-8";
                if (!Utils.isEmpty(meta.getEncoding())) {
                    encoding = meta.getEncoding();
                }
                InputStream is = KettleVFS.getInputStream(file);
                try {
                    data.document = reader.read(is, encoding);
                } finally {
                    BaseStep.closeQuietly(is);
                }
            }

            if (meta.isNamespaceAware()) {
                prepareNSMap(data.document.getRootElement());
            }
        } catch (Exception e) {
            if (data.stopPruning) {
                // ignore error when pruning
                return false;
            } else {
                throw new KettleException(e);
            }
        }
        return true;
    }

    /**
     * Handles one pruned element while the reader is in streaming mode. Invoked only from the element
     * handler registered when a prune path is set. Not allowed in combination with meta.getIsInFields(),
     * but could be redesigned later on.
     *
     * @param row the element that has just been read completely
     * @throws KettleException when the loop XPath cannot be applied or a row cannot be produced
     */
    private void processStreaming(Element row) throws KettleException {
        data.document = row.getDocument();

        if (meta.isNamespaceAware()) {
            prepareNSMap(data.document.getRootElement());
        }
        if (log.isDebug()) {
            logDebug(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.ApplyXPath"));
        }

        // Prune path identical to the loop path: the element itself is the single row to emit.
        if (data.PathValue.equals(data.prunePath)) {
            data.an.set(0, (AbstractNode) row);
            data.nodesize = 1; // exactly one node per call
            data.nodenr = 0;
            if (log.isDebug()) {
                logDebug(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.ProcessingRows"));
            }
            Object[] singleRow = getXMLRowPutRowWithErrorhandling();
            if (!data.errorInRowButContinue) {
                // The result (false once the limit is reached) is ignored on purpose: reading the
                // whole file cannot be interrupted from here (slow but works).
                putRowOut(singleRow);
            }
            data.nodesize = 0;
            data.nodenr = 0;
            return;
        }

        // Otherwise evaluate the loop XPath against the document.
        if (!applyXPath()) {
            throw new KettleException(BaseMessages.getString(PKG, "GetXMLData.Log.UnableApplyXPath"));
        }

        // Emit rows until the nodes are exhausted, the limit is reached or the transformation is
        // stopped (similar to what BaseStep.runStepThread does).
        if (log.isDebug()) {
            logDebug(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.ProcessingRows"));
        }
        while (data.nodenr < data.nodesize && !isStopped()) {
            Object[] outputRow = getXMLRowPutRowWithErrorhandling();
            // A row flagged with an error is dropped; putRowOut() returns false once the limit is hit.
            if (!data.errorInRowButContinue && !putRowOut(outputRow)) {
                break;
            }
        }

        if (log.isDebug()) {
            logDebug(BaseMessages.getString(PKG, "GetXMLData.Log.StreamingMode.FreeMemory"));
        }
        // Release the processed nodes.
        data.an.clear();
        data.nodesize = data.an.size();
        data.nodenr = 0;
    }

    /**
     * Walks the element tree below {@code l} and records every declared namespace in
     * {@code data.NAMESPACE}. A namespace without prefix is stored under a generated key
     * {@code "pre" + n}, where {@code n} is the current size of {@code data.NSPath}, and the path of the
     * declaring element is appended to {@code data.NSPath}.
     *
     * @param l element to start from
     */
    public void prepareNSMap(Element l) {
        @SuppressWarnings("unchecked")
        List<Namespace> declared = l.declaredNamespaces();
        for (Namespace namespace : declared) {
            String prefix = namespace.getPrefix();
            if (!prefix.trim().isEmpty()) {
                // explicitly prefixed namespace: keep the prefix as it is
                data.NAMESPACE.put(prefix, namespace.getURI());
                continue;
            }

            // default namespace: invent a prefix and remember where it was declared
            data.NAMESPACE.put("pre" + data.NSPath.size(), namespace.getURI());
            StringBuilder elementPath = new StringBuilder();
            for (Element current = l; current != null; current = current.getParent()) {
                String elementPrefix = current.getNamespacePrefix();
                boolean prefixed = elementPrefix != null && elementPrefix.length() > 0;
                String step = prefixed ? elementPrefix + ":" + current.getName() : current.getName();
                // ancestors are visited bottom-up, so each step goes in front
                elementPath.insert(0, GetXMLDataMeta.N0DE_SEPARATOR + step);
            }
            data.NSPath.add(elementPath.toString());
        }

        @SuppressWarnings("unchecked")
        List<Element> children = l.elements();
        for (Element child : children) {
            prepareNSMap(child);
        }
    }

    /**
     * Allocates a new row array through {@link RowDataUtil#allocateRowData(int)}, sized for the number
     * of fields in {@code data.outputRowMeta}.
     *
     * @return the newly allocated row
     */
    private Object[] buildEmptyRow() {
        final int fieldCount = data.outputRowMeta.size();
        return RowDataUtil.allocateRowData(fieldCount);
    }

    /**
     * Fails when {@code data.files} reports files that do not exist or cannot be accessed. Non-existent
     * files are checked first; the problem is logged before the exception is thrown.
     *
     * @throws KettleException when at least one file is missing or not accessible
     */
    private void handleMissingFiles() throws KettleException {
        List<FileObject> missing = data.files.getNonExistantFiles();
        if (!missing.isEmpty()) {
            String description = FileInputList.getRequiredFilesDescription(missing);
            String title = BaseMessages.getString(PKG, "GetXMLData.Log.RequiredFilesTitle");
            String detail = BaseMessages.getString(PKG, "GetXMLData.Log.RequiredFiles", description);
            logError(title, detail);

            throw new KettleException(
                    BaseMessages.getString(PKG, "GetXMLData.Log.RequiredFilesMissing", description));
        }

        List<FileObject> inaccessible = data.files.getNonAccessibleFiles();
        if (!inaccessible.isEmpty()) {
            String description = FileInputList.getRequiredFilesDescription(inaccessible);
            String title = BaseMessages.getString(PKG, "GetXMLData.Log.RequiredFilesTitle");
            String detail = BaseMessages.getString(PKG, "GetXMLData.Log.RequiredNotAccessibleFiles",
                    description);
            logError(title, detail);

            throw new KettleException(BaseMessages.getString(PKG,
                    "GetXMLData.Log.RequiredNotAccessibleFilesMissing", description));
        }
    }

    /**
     * Fetches the next input row and, when the XML source is defined in a field, loads the referenced
     * document (file, URL or inline XML) and applies the loop XPath to it.
     * <p>
     * On the first row the input/output/conversion row meta-data is set up and the position of the XML
     * field is resolved. Empty files are skipped when the step is configured to ignore them; skipping is
     * done in a loop so that a long run of empty files cannot exhaust the call stack.
     *
     * @return {@code true} when a row was read (and its document prepared), {@code false} when there are
     *         no more input rows or an error occurred (the error is logged, all steps are stopped and the
     *         error count is set)
     */
    private boolean ReadNextString() {

        try {
            while (true) {
                // Grab another row ...
                data.readrow = getRow();

                if (data.readrow == null) {
                    // finished processing!

                    if (log.isDetailed()) {
                        logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.FinishedProcessing"));
                    }
                    return false;
                }

                if (first) {
                    first = false;

                    data.nrReadRow = getInputRowMeta().size();
                    data.inputRowMeta = getInputRowMeta();
                    data.outputRowMeta = data.inputRowMeta.clone();
                    meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);

                    // Get total previous fields
                    data.totalpreviousfields = data.inputRowMeta.size();

                    // For String to <type> conversions, we allocate a conversion meta data row
                    // (all output fields typed as String, keeping the Date & Number formatters).
                    data.convertRowMeta = data.outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);

                    // Check is XML field is provided
                    if (Utils.isEmpty(meta.getXMLField())) {
                        logError(BaseMessages.getString(PKG, "GetXMLData.Log.NoField"));
                        throw new KettleException(BaseMessages.getString(PKG, "GetXMLData.Log.NoField"));
                    }

                    // cache the position of the field
                    if (data.indexOfXmlField < 0) {
                        data.indexOfXmlField = getInputRowMeta().indexOfValue(meta.getXMLField());
                        if (data.indexOfXmlField < 0) {
                            // The field is unreachable !
                            logError(BaseMessages.getString(PKG, "GetXMLData.Log.ErrorFindingField",
                                    meta.getXMLField()));
                            throw new KettleException(BaseMessages.getString(PKG,
                                    "GetXMLData.Exception.CouldnotFindField", meta.getXMLField()));
                        }
                    }
                }

                if (meta.isInFields()) {
                    // get XML field value
                    String Fieldvalue = getInputRowMeta().getString(data.readrow, data.indexOfXmlField);

                    if (log.isDetailed()) {
                        logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.XMLStream", meta.getXMLField(),
                                Fieldvalue));
                    }

                    if (meta.getIsAFile()) {
                        FileObject file = null;
                        try {
                            // XML source is a file.
                            file = KettleVFS.getFileObject(environmentSubstitute(Fieldvalue), getTransMeta());

                            if (meta.isIgnoreEmptyFile() && file.getContent().getSize() == 0) {
                                logBasic(BaseMessages.getString(PKG, "GetXMLData.Error.FileSizeZero",
                                        "" + file.getName()));
                                // Skip this row and try the next one; the finally block closes the file.
                                continue;
                            }

                            // Open the XML document
                            if (!setDocument(null, file, false, false)) {
                                throw new KettleException(
                                        BaseMessages.getString(PKG, "GetXMLData.Log.UnableCreateDocument"));
                            }

                            if (!applyXPath()) {
                                throw new KettleException(
                                        BaseMessages.getString(PKG, "GetXMLData.Log.UnableApplyXPath"));
                            }

                            addFileToResultFilesname(file);

                            if (log.isDetailed()) {
                                logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.LoopFileOccurences",
                                        "" + data.nodesize, file.getName().getBaseName()));
                            }

                        } catch (Exception e) {
                            throw new KettleException(e);
                        } finally {
                            try {
                                if (file != null) {
                                    file.close();
                                }
                            } catch (Exception e) {
                                // Ignore close errors
                            }
                        }
                    } else {
                        boolean url = false;
                        boolean xmltring = true;
                        if (meta.isReadUrl()) {
                            url = true;
                            xmltring = false;
                        }

                        // Open the XML document
                        if (!setDocument(Fieldvalue, null, xmltring, url)) {
                            throw new KettleException(
                                    BaseMessages.getString(PKG, "GetXMLData.Log.UnableCreateDocument"));
                        }

                        // Apply XPath and set node list
                        if (!applyXPath()) {
                            throw new KettleException(
                                    BaseMessages.getString(PKG, "GetXMLData.Log.UnableApplyXPath"));
                        }
                        if (log.isDetailed()) {
                            logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.LoopFileOccurences",
                                    "" + data.nodesize));
                        }
                    }
                }
                return true;
            }
        } catch (Exception e) {
            logError(BaseMessages.getString(PKG, "GetXMLData.Log.UnexpectedError", e.toString()));
            stopAll();
            logError(Const.getStackTracker(e));
            setErrors(1);
            return false;
        }

    }

    /**
     * Registers {@code file} as a result file of this step, provided the step is configured to add
     * result files.
     *
     * @param file the file that has been processed
     * @throws Exception when the result file cannot be created
     */
    private void addFileToResultFilesname(FileObject file) throws Exception {
        if (!meta.addResultFile()) {
            return; // nothing to register
        }
        String transName = getTransMeta().getName();
        String stepName = getStepname();
        ResultFile processed = new ResultFile(ResultFile.FILE_TYPE_GENERAL, file, transName, stepName);
        processed.setComment(BaseMessages.getString(PKG, "GetXMLData.Log.FileAddedResult"));
        addResultFile(processed);
    }

    /**
     * Rewrites an XPath so that unprefixed element steps get a generated {@code "pre" + index + ":"}
     * namespace prefix, where {@code index} refers to an entry of {@code data.NSPath}. When
     * {@code data.NSPath} is empty the path is returned unchanged.
     *
     * @param path     the XPath to rewrite (the loop path itself or a path relative to it)
     * @param loopPath the loop XPath that {@code path} is resolved against
     * @return the rewritten path, or {@code path} itself when no namespace paths are recorded
     */
    public String addNSPrefix(String path, String loopPath) {
        if (data.NSPath.size() > 0) {
            // Build the full path of the target: start at the loop path and apply each step of
            // the given path (".." removes the last step, anything else is appended).
            String fullPath = loopPath;
            if (!path.equals(fullPath)) {
                for (String tmp : path.split(GetXMLDataMeta.N0DE_SEPARATOR)) {
                    if (tmp.equals("..")) {
                        // NOTE(review): lastIndexOf() returns -1 when no separator is left, which would
                        // make substring() throw - confirm such paths cannot occur.
                        fullPath = fullPath.substring(0, fullPath.lastIndexOf(GetXMLDataMeta.N0DE_SEPARATOR));
                    } else {
                        fullPath += GetXMLDataMeta.N0DE_SEPARATOR + tmp;
                    }
                }
            }
            // One slot per split piece of fullPath minus one (presumably the empty piece before a
            // leading separator - TODO confirm); -1 means "no recorded namespace path applies".
            int[] indexs = new int[fullPath.split(GetXMLDataMeta.N0DE_SEPARATOR).length - 1];
            java.util.Arrays.fill(indexs, -1);
            int length = 0;
            // For every recorded namespace path that is a prefix of fullPath and longer than the
            // previously accepted one, mark the slots from its last step onwards with its index.
            for (int i = 0; i < data.NSPath.size(); i++) {
                if (data.NSPath.get(i).length() > length && fullPath.startsWith(data.NSPath.get(i))) {
                    java.util.Arrays.fill(indexs,
                            data.NSPath.get(i).split(GetXMLDataMeta.N0DE_SEPARATOR).length - 2, indexs.length, i);
                    length = data.NSPath.get(i).length();
                }
            }

            // Re-assemble the given path step by step, prefixing the steps that qualify.
            StringBuilder newPath = new StringBuilder();
            String[] pathStrs = path.split(GetXMLDataMeta.N0DE_SEPARATOR);
            for (int i = 0; i < pathStrs.length; i++) {
                String tmp = pathStrs[i];
                if (newPath.length() > 0) {
                    newPath.append(GetXMLDataMeta.N0DE_SEPARATOR);
                }
                // Only non-empty steps without ':' , '.' or the AT marker are candidates for a prefix.
                if (tmp.length() > 0 && !tmp.contains(":") && !tmp.contains(".")
                        && !tmp.contains(GetXMLDataMeta.AT)) {
                    // Slots are aligned to the END of fullPath, hence the offset.
                    // NOTE(review): the offset is negative when path has more pieces than indexs has
                    // slots - confirm callers never pass such a path.
                    int index = indexs[i + indexs.length - pathStrs.length];
                    if (index >= 0) {
                        newPath.append("pre").append(index).append(":").append(tmp);
                    } else {
                        newPath.append(tmp);
                    }
                } else {
                    newPath.append(tmp);
                }
            }
            return newPath.toString();
        }
        return path;
    }

    /**
     * Evaluates the loop XPath ({@code data.PathValue}) against the current document and stores the
     * matching nodes in {@code data.an}, resetting the node counter. In namespace-aware mode the path is
     * first rewritten with {@link #addNSPrefix(String, String)} and bound to {@code data.NAMESPACE}.
     *
     * @return {@code true} on success, {@code false} when the evaluation failed (the error is logged)
     */
    @SuppressWarnings("unchecked")
    private boolean applyXPath() {
        try {
            XPath loopXPath = data.document.createXPath(data.PathValue);
            if (meta.isNamespaceAware()) {
                String prefixedPath = addNSPrefix(data.PathValue, data.PathValue);
                loopXPath = data.document.createXPath(prefixedPath);
                loopXPath.setNamespaceURIs(data.NAMESPACE);
            }
            // collect the nodes to loop over and start at the first one
            data.an = loopXPath.selectNodes(data.document);
            data.nodesize = data.an.size();
            data.nodenr = 0;
            return true;
        } catch (Exception e) {
            logError(BaseMessages.getString(PKG, "GetXMLData.Log.ErrorApplyXPath", e.getMessage()));
            return false;
        }
    }

    /**
     * Opens the next file of {@code data.files}: captures the optional file attributes (short name,
     * path, hidden flag, extension, modification date, URIs, size), advances the file pointer, loads the
     * document and - unless streaming mode already did so - applies the loop XPath.
     * <p>
     * An empty file is skipped when the step is configured to ignore empty files; the outcome of opening
     * the following file is then returned, so a failure or the end of the file list is not masked.
     *
     * @return {@code true} when a file was opened, {@code false} when there are no more files, when
     *         streaming was stopped, or when an error occurred (the error is logged, all steps are
     *         stopped and the error count is set)
     */
    private boolean openNextFile() {
        try {
            if (data.filenr >= data.files.nrOfFiles()) {
                // finished processing!

                if (log.isDetailed()) {
                    logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.FinishedProcessing"));
                }
                return false;
            }
            // get file
            data.file = data.files.getFile(data.filenr);
            data.filename = KettleVFS.getFilename(data.file);
            // Add additional fields?
            if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) {
                data.shortFilename = data.file.getName().getBaseName();
            }
            if (meta.getPathField() != null && meta.getPathField().length() > 0) {
                data.path = KettleVFS.getFilename(data.file.getParent());
            }
            if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) {
                data.hidden = data.file.isHidden();
            }
            if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) {
                data.extension = data.file.getName().getExtension();
            }
            if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) {
                data.lastModificationDateTime = new Date(data.file.getContent().getLastModifiedTime());
            }
            if (meta.getUriField() != null && meta.getUriField().length() > 0) {
                data.uriName = data.file.getName().getURI();
            }
            if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) {
                data.rootUriName = data.file.getName().getRootURI();
            }
            // Check if file is empty; -1 marks a size that could not be determined
            long fileSize;
            try {
                fileSize = data.file.getContent().getSize();
            } catch (FileSystemException e) {
                fileSize = -1;
            }

            if (meta.getSizeField() != null && meta.getSizeField().length() > 0) {
                data.size = fileSize;
            }
            // Move file pointer ahead!
            data.filenr++;

            if (meta.isIgnoreEmptyFile() && fileSize == 0) {
                // log only basic as a warning (was before logError)
                logBasic(BaseMessages.getString(PKG, "GetXMLData.Error.FileSizeZero", "" + data.file.getName()));
                // Propagate the outcome of the next file: ignoring it would report success even when
                // that file failed to open (after stopAll()) or the list was exhausted.
                return openNextFile();

            } else {
                if (log.isDetailed()) {
                    logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.OpeningFile", data.file.toString()));
                }

                // Open the XML document
                if (!setDocument(null, data.file, false, false)) {
                    if (data.stopPruning) {
                        return false; // ignore error when stopped while pruning
                    }
                    throw new KettleException(BaseMessages.getString(PKG, "GetXMLData.Log.UnableCreateDocument"));
                }

                // Apply XPath and set node list
                if (data.prunePath == null) { // this was already done in processStreaming()
                    if (!applyXPath()) {
                        throw new KettleException(BaseMessages.getString(PKG, "GetXMLData.Log.UnableApplyXPath"));
                    }
                }

                addFileToResultFilesname(data.file);

                if (log.isDetailed()) {
                    logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.FileOpened", data.file.toString()));
                    logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.LoopFileOccurences", "" + data.nodesize,
                            data.file.getName().getBaseName()));
                }
            }
        } catch (Exception e) {
            // data.file can still be null when the failure happened before a file was fetched;
            // guard against it so the original error is reported instead of a NullPointerException.
            String fileDescription = data.file == null ? "" : data.file.toString();
            logError(BaseMessages.getString(PKG, "GetXMLData.Log.UnableToOpenFile", "" + data.filenr,
                    fileDescription, e.toString()));
            stopAll();
            setErrors(1);
            return false;
        }
        return true;
    }

    /**
     * Produces at most one output row per call. On the first call in file mode (XML source not defined
     * in a field) the file list is resolved and checked and the output/conversion row meta-data is
     * created.
     *
     * @param smi step meta-data (not read here; the {@code meta} field is used)
     * @param sdi step data (not read here; the {@code data} field is used)
     * @return {@code true} when the step should be called again, {@code false} when it is finished
     * @throws KettleException when no files are available, required files are missing or a row cannot be
     *                         read
     */
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        if (first && !meta.isInFields()) {
            first = false;

            data.files = meta.getFiles(this);

            if (!meta.isdoNotFailIfNoFile() && data.files.nrOfFiles() == 0) {
                throw new KettleException(BaseMessages.getString(PKG, "GetXMLData.Log.NoFiles"));
            }

            handleMissingFiles();

            // Output row layout as defined by the step meta-data
            RowMeta outputLayout = new RowMeta();
            data.outputRowMeta = outputLayout;
            meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);

            // String-typed copy of the output layout, used for String to <type> conversions
            // (it carries the Date & Number formatters).
            data.convertRowMeta = data.outputRowMeta.cloneToType(ValueMetaInterface.TYPE_STRING);
        }

        final Object[] outputRow = getXMLRow();
        if (!data.errorInRowButContinue) {
            if (outputRow != null) {
                return putRowOut(outputRow);
            }
            // end of data or error: signal end to receiver(s)
            setOutputDone();
            return false;
        }
        // the row had an error but processing continues: do not put it out
        return true;

    }

    /**
     * Sends a row to the output rowset(s), updates the input-line and row counters and checks the row
     * limit.
     *
     * @param r the row to put out
     * @return {@code false} when a positive row limit is set and {@code data.rownr} has exceeded it
     *         (the output is then marked as done), {@code true} otherwise
     * @throws KettleException when the row cannot be logged or passed on
     */
    private boolean putRowOut(Object[] r) throws KettleException {
        if (log.isRowLevel()) {
            String rowText = data.outputRowMeta.getString(r);
            logRowlevel(BaseMessages.getString(PKG, "GetXMLData.Log.ReadRow", rowText));
        }
        incrementLinesInput();
        data.rownr++;
        putRow(data.outputRowMeta, r); // hand the row to the output rowset(s)

        boolean limitReached = meta.getRowLimit() > 0 && data.rownr > meta.getRowLimit();
        if (!limitReached) {
            return true;
        }
        // limit has been reached: stop now.
        setOutputDone();
        return false;
    }

    /**
     * Returns the next output row. In file mode, files are opened one after another until one provides
     * unprocessed nodes; in field mode the work is left to
     * {@link #getXMLRowPutRowWithErrorhandling()}.
     *
     * @return the next row, or {@code null} when no further file could be opened
     * @throws KettleException when the row cannot be read
     */
    private Object[] getXMLRow() throws KettleException {

        if (meta.isInFields()) {
            return getXMLRowPutRowWithErrorhandling();
        }

        // keep opening files while there is no current file or all of its nodes are consumed
        while (data.file == null || data.nodenr >= data.nodesize) {
            boolean opened = openNextFile();
            if (!opened) {
                data.errorInRowButContinue = false; // stop in all cases
                return null;
            }
        }
        return getXMLRowPutRowWithErrorhandling();
    }

    /**
     * Builds the output row for the current node. In field mode, input rows are fetched through
     * {@link #ReadNextString()} until one provides unprocessed nodes. The
     * {@code data.errorInRowButContinue} flag is cleared before processing starts.
     *
     * @return the row built for the current node, or {@code null} when no further input row is available
     * @throws KettleException wrapping any failure while reading or building the row
     */
    private Object[] getXMLRowPutRowWithErrorhandling() throws KettleException {
        data.errorInRowButContinue = false;
        try {
            if (meta.isInFields()) {
                // need a (new) input row while none is loaded or all of its nodes are consumed
                while (data.readrow == null || data.nodenr >= data.nodesize) {
                    boolean gotNext = ReadNextString();
                    if (!gotNext || data.readrow == null) {
                        return null;
                    }
                }
            }

            Node current = data.an.get(data.nodenr);
            return processPutRow(current);
        } catch (Exception e) {
            throw new KettleException(BaseMessages.getString(PKG, "GetXMLData.Error.UnableReadFile"), e);
        }
    }

    /**
     * Builds one output row from the given XML node.
     * <p>
     * The method advances {@code data.nodenr}, evaluates every configured
     * field's resolved XPath against the node (after optional token
     * substitution and with namespace handling when enabled), trims and
     * converts each value, optionally repeats the previous row's value for
     * empty results, appends the optional file-related columns and finally
     * remembers the row in {@code data.previousRow}.
     *
     * @param node
     *            the node the field XPaths are evaluated against
     * @return the populated row, or {@code null} when an exception occurred
     *         and the row was handed to the step's error handling (in that
     *         case {@code data.errorInRowButContinue} is set to {@code true})
     * @throws KettleException
     *             when an exception occurs and the step is not doing error
     *             handling
     */
    private Object[] processPutRow(Node node) throws KettleException {
        // Create new row...
        Object[] outputRowData = buildEmptyRow();

        // When reading XML from a field, start from a copy of the incoming row
        if (meta.isInFields()) {
            System.arraycopy(data.readrow, 0, outputRowData, 0, data.nrReadRow);
        }
        try {
            // Move to the next node before reading the fields
            data.nodenr++;

            // Read fields...
            for (int i = 0; i < data.nrInputFields; i++) {
                // Get field
                GetXMLDataField xmlDataField = meta.getInputFields()[i];
                // Get the Path to look for
                String XPathValue = xmlDataField.getResolvedXPath();

                if (meta.isuseToken()) {
                    // See if user use Token inside path field
                    // The syntax is : @_Fieldname-
                    // PDI will search for Fieldname value and replace it
                    // Fieldname must be defined before the current node
                    XPathValue = substituteToken(XPathValue, outputRowData);
                    if (isDetailed()) {
                        logDetailed(XPathValue);
                    }
                }

                // Get node value
                String nodevalue;

                // Handle namespaces
                if (meta.isNamespaceAware()) {
                    XPath xpathField = node.createXPath(addNSPrefix(XPathValue, data.PathValue));
                    xpathField.setNamespaceURIs(data.NAMESPACE);
                    if (xmlDataField.getResultType() == GetXMLDataField.RESULT_TYPE_VALUE_OF) {
                        nodevalue = xpathField.valueOf(node);
                    } else {
                        // A missing node yields an empty string instead of a NullPointerException
                        Node n = xpathField.selectSingleNode(node);
                        if (n != null) {
                            nodevalue = n.asXML();
                        } else {
                            nodevalue = "";
                        }
                    }
                } else {
                    if (xmlDataField.getResultType() == GetXMLDataField.RESULT_TYPE_VALUE_OF) {
                        nodevalue = node.valueOf(XPathValue);
                    } else {
                        // A missing node yields an empty string instead of a NullPointerException
                        Node n = node.selectSingleNode(XPathValue);
                        if (n != null) {
                            nodevalue = n.asXML();
                        } else {
                            nodevalue = "";
                        }
                    }
                }

                // Do trimming
                switch (xmlDataField.getTrimType()) {
                case GetXMLDataField.TYPE_TRIM_LEFT:
                    nodevalue = Const.ltrim(nodevalue);
                    break;
                case GetXMLDataField.TYPE_TRIM_RIGHT:
                    nodevalue = Const.rtrim(nodevalue);
                    break;
                case GetXMLDataField.TYPE_TRIM_BOTH:
                    nodevalue = Const.trim(nodevalue);
                    break;
                default:
                    break;
                }

                // Do conversions
                //
                int fieldIndex = data.totalpreviousfields + i;
                ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(fieldIndex);
                ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(fieldIndex);
                outputRowData[fieldIndex] = targetValueMeta.convertData(sourceValueMeta, nodevalue);

                // Do we need to repeat this field if it is null?
                if (xmlDataField.isRepeated()) {
                    if (data.previousRow != null && Utils.isEmpty(nodevalue)) {
                        outputRowData[fieldIndex] = data.previousRow[fieldIndex];
                    }
                }
            } // End of loop over fields...

            // Optional columns are appended right after the XML fields, in this fixed order
            int rowIndex = data.totalpreviousfields + data.nrInputFields;

            // See if we need to add the filename to the row...
            if (meta.includeFilename() && !Utils.isEmpty(meta.getFilenameField())) {
                outputRowData[rowIndex++] = data.filename;
            }
            // See if we need to add the row number to the row...
            if (meta.includeRowNumber() && !Utils.isEmpty(meta.getRowNumberField())) {
                outputRowData[rowIndex++] = data.rownr;
            }
            // Possibly add short filename...
            if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) {
                outputRowData[rowIndex++] = data.shortFilename;
            }
            // Add Extension
            if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) {
                outputRowData[rowIndex++] = data.extension;
            }
            // add path
            if (meta.getPathField() != null && meta.getPathField().length() > 0) {
                outputRowData[rowIndex++] = data.path;
            }
            // Add Size
            if (meta.getSizeField() != null && meta.getSizeField().length() > 0) {
                outputRowData[rowIndex++] = data.size;
            }
            // add Hidden
            if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) {
                // Previously this parsed data.path as a boolean, which is false for
                // any real path; emit the file's hidden flag instead.
                // NOTE(review): assumes data.hidden is filled in when the file is
                // opened - confirm against the file-opening code.
                outputRowData[rowIndex++] = Boolean.valueOf(data.hidden);
            }
            // Add modification date
            if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) {
                outputRowData[rowIndex++] = data.lastModificationDateTime;
            }
            // Add Uri
            if (meta.getUriField() != null && meta.getUriField().length() > 0) {
                outputRowData[rowIndex++] = data.uriName;
            }
            // Add RootUri
            if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) {
                outputRowData[rowIndex] = data.rootUriName;
            }

            RowMetaInterface irow = getInputRowMeta();

            if (irow == null) {
                data.previousRow = outputRowData;
            } else {
                // clone to previously allocated array to make sure next step doesn't
                // change it in between...
                System.arraycopy(outputRowData, 0, this.prevRow, 0, outputRowData.length);
                // Pick up everything else that needs a real deep clone
                data.previousRow = irow.cloneRow(outputRowData, this.prevRow);
            }
        } catch (Exception e) {
            if (getStepMeta().isDoingErrorHandling()) {
                // Simply add this row to the error row
                putError(data.outputRowMeta, outputRowData, 1, e.toString(), null, "GetXMLData001");
                data.errorInRowButContinue = true;
                return null;
            } else {
                logError(e.toString());
                throw new KettleException(e.toString());
            }
        }
        return outputRowData;
    }

    /**
     * Replaces every token in the given string with the quoted value of the
     * field it names.
     * <p>
     * A token is the text between {@code data.tokenStart} and
     * {@code data.tokenEnd}. When the enclosed name matches (ignoring case) the
     * name of one of the configured input fields, the token is replaced by the
     * value already stored for that field in {@code outputRowData}, wrapped in
     * single quotes. When no field matches, the token is replaced by the bare
     * name without its delimiters.
     *
     * @param aString
     *            the text that may contain tokens; may be {@code null}
     * @param outputRowData
     *            the row being built, read at
     *            {@code data.totalpreviousfields + fieldIndex}
     * @return the text with all complete tokens substituted, or {@code null}
     *         when {@code aString} is {@code null}
     */
    public String substituteToken(String aString, Object[] outputRowData) {
        if (aString == null) {
            return null;
        }

        StringBuilder buffer = new StringBuilder();

        String rest = aString;

        // search for opening string
        int i = rest.indexOf(data.tokenStart);
        while (i > -1) {
            // search for closing string
            int j = rest.indexOf(data.tokenEnd, i + data.tokenStart.length());
            if (j > -1) {
                String varName = rest.substring(i + data.tokenStart.length(), j);
                Object Value = varName;

                for (int k = 0; k < data.nrInputFields; k++) {
                    GetXMLDataField Tmp_xmlInputField = meta.getInputFields()[k];
                    if (Tmp_xmlInputField.getName().equalsIgnoreCase(varName)) {
                        Value = "'" + outputRowData[data.totalpreviousfields + k] + "'";
                    }
                }
                buffer.append(rest.substring(0, i));
                buffer.append(Value);
                rest = rest.substring(j + data.tokenEnd.length());
            } else {
                // no closing tag found; end the search
                buffer.append(rest);
                rest = "";
            }
            // Keep searching for the next token START. Searching for the end
            // delimiter here (as before) skipped every token after the first and
            // could drop text when the end delimiter appeared later in the string.
            i = rest.indexOf(data.tokenStart);
        }
        buffer.append(rest);
        return buffer.toString();
    }

    /**
     * Initializes the step: resets the row counter, resolves the XPath of
     * every input field once, and prepares the loop XPath and the prune path.
     *
     * @param smi
     *            the step metadata, cast to {@link GetXMLDataMeta}
     * @param sdi
     *            the step data, cast to {@link GetXMLDataData}
     * @return {@code true} when initialization succeeded; {@code false} when
     *         the parent initialization failed or the loop XPath is empty
     */
    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (GetXMLDataMeta) smi;
        data = (GetXMLDataData) sdi;

        if (!super.init(smi, sdi)) {
            return false;
        }

        data.rownr = 1L;
        data.nrInputFields = meta.getInputFields().length;

        // Resolve each field's XPath once, fixing up attribute paths where needed.
        for (int fieldNr = 0; fieldNr < data.nrInputFields; fieldNr++) {
            GetXMLDataField field = meta.getInputFields()[fieldNr];
            // Resolve variable substitution
            String resolved = environmentSubstitute(field.getXPath());

            if (field.getElementType() == GetXMLDataField.ELEMENT_TYPE_ATTRIBUT) {
                // An attribute needs a leading @ on the last path element only,
                // never in front of the whole path. With no separator present the
                // last element starts at index 0, i.e. it is the whole path.
                int attributeStart = resolved.lastIndexOf(GetXMLDataMeta.N0DE_SEPARATOR) + 1;
                String attributeName = resolved.substring(attributeStart);
                if (!attributeName.startsWith(GetXMLDataMeta.AT)) {
                    resolved = resolved.substring(0, attributeStart) + GetXMLDataMeta.AT + attributeName;
                }
            }
            field.setResolvedXPath(resolved);
        }

        // Loop XPath: mandatory, and always stored with a leading separator.
        String loopPath = environmentSubstitute(meta.getLoopXPath());
        data.PathValue = loopPath;
        if (Utils.isEmpty(loopPath)) {
            logError(BaseMessages.getString(PKG, "GetXMLData.Error.EmptyPath"));
            return false;
        }
        if (!loopPath.substring(0, 1).equals(GetXMLDataMeta.N0DE_SEPARATOR)) {
            data.PathValue = GetXMLDataMeta.N0DE_SEPARATOR + loopPath;
        }
        if (log.isDetailed()) {
            logDetailed(BaseMessages.getString(PKG, "GetXMLData.Log.LoopXPath", data.PathValue));
        }

        // Prune path: optional; blank means "none", and it is not used at all
        // when the XML comes from a field (not possible by design, could be
        // changed later on).
        String prune = environmentSubstitute(meta.getPrunePath());
        if (prune != null) {
            if (Utils.isEmpty(prune.trim()) || meta.isInFields()) {
                prune = null;
            } else if (!prune.startsWith(GetXMLDataMeta.N0DE_SEPARATOR)) {
                // ensure a leading slash
                prune = GetXMLDataMeta.N0DE_SEPARATOR + prune;
            }
        }
        data.prunePath = prune;

        return true;
    }

    /**
     * Releases what the step data still references: closes the current file
     * and the open reader/stream, empties and drops the node and namespace
     * collections, clears the remaining references, and then delegates to the
     * parent implementation.
     *
     * @param smi
     *            the step metadata, cast to {@link GetXMLDataMeta}
     * @param sdi
     *            the step data, cast to {@link GetXMLDataData}
     */
    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (GetXMLDataMeta) smi;
        data = (GetXMLDataData) sdi;

        // Closing the current file is best effort.
        if (data.file != null) {
            try {
                data.file.close();
            } catch (Exception e) {
                // Ignore close errors
            }
        }

        // Empty the collections before dropping them.
        if (data.an != null) {
            data.an.clear();
            data.an = null;
        }
        if (data.NAMESPACE != null) {
            data.NAMESPACE.clear();
            data.NAMESPACE = null;
        }
        if (data.NSPath != null) {
            data.NSPath.clear();
            data.NSPath = null;
        }

        // Plain references only need to be reset.
        data.readrow = null;
        data.document = null;

        if (data.fr != null) {
            BaseStep.closeQuietly(data.fr);
        }
        if (data.is != null) {
            BaseStep.closeQuietly(data.is);
        }

        data.files = null;

        super.dispose(smi, sdi);
    }

}