org.codelibs.robot.transformer.impl.XmlTransformer.java Source code

Java tutorial

Introduction

Here is the source code for org.codelibs.robot.transformer.impl.XmlTransformer.java

Source

/*
 * Copyright 2012-2015 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.robot.transformer.impl;

import java.io.File;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerException;

import org.apache.commons.io.IOUtils;
import org.apache.xml.utils.PrefixResolverDefault;
import org.apache.xpath.CachedXPathAPI;
import org.codelibs.core.beans.util.BeanUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.robot.Constants;
import org.codelibs.robot.entity.AccessResultData;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.entity.ResultData;
import org.codelibs.robot.exception.RobotCrawlAccessException;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.robot.util.ResponseDataUtil;
import org.codelibs.robot.util.XmlUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * @author shinsuke
 *
 */
public class XmlTransformer extends AbstractTransformer {
    private static final Logger logger = LoggerFactory // NOPMD
            .getLogger(XmlTransformer.class);

    private static final Pattern SPACE_PATTERN = Pattern.compile("\\s+", Pattern.MULTILINE);

    private boolean namespaceAware;

    private boolean coalescing;

    private boolean expandEntityRef = true;

    private boolean ignoringComments;

    private boolean ignoringElementContentWhitespace;

    private boolean validating;

    private boolean includeAware;

    private final Map<String, Object> attributeMap = new HashMap<String, Object>();

    private final Map<String, String> featureMap = new HashMap<String, String>();

    protected Map<String, String> fieldRuleMap = new LinkedHashMap<String, String>();

    /** a flag to trim a space characters. */
    protected boolean trimSpace = true;

    protected String charsetName = Constants.UTF_8;

    /**
     * Class type returned by getData() method. The default is null(XML content
     * of String).
     */
    protected Class<?> dataClass = null;

    private final ThreadLocal<CachedXPathAPI> xpathAPI = new ThreadLocal<CachedXPathAPI>();

    /**
     * Returns data as XML content of String.
     *
     * @return XML content of String.
     */
    @Override
    public Object getData(final AccessResultData<?> accessResultData) {
        if (dataClass == null) {
            // check transformer name
            if (!getName().equals(accessResultData.getTransformerName())) {
                throw new RobotSystemException("Transformer is invalid. Use "
                        + accessResultData.getTransformerName() + ". This transformer is " + getName() + ".");
            }

            final byte[] data = accessResultData.getData();
            if (data == null) {
                return null;
            }
            final String encoding = accessResultData.getEncoding();
            try {
                return new String(data, encoding == null ? Constants.UTF_8 : encoding);
            } catch (final UnsupportedEncodingException e) {
                if (logger.isInfoEnabled()) {
                    logger.info("Invalid charsetName: " + encoding + ". Changed to " + Constants.UTF_8, e);
                }
                return new String(data, Constants.UTF_8_CHARSET);
            }
        }

        final Map<String, Object> dataMap = XmlUtil.getDataMap(accessResultData);
        if (Map.class.equals(dataClass)) {
            return dataMap;
        }

        try {
            final Object obj = dataClass.newInstance();
            BeanUtil.copyMapToBean(dataMap, obj);
            return obj;
        } catch (final Exception e) {
            throw new RobotSystemException("Could not create/copy a data map to " + dataClass, e);
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see org.codelibs.robot.transformer.impl.AbstractTransformer#transform(org.seasar.robot.entity.ResponseData)
     */
    @Override
    public ResultData transform(final ResponseData responseData) {
        if (responseData == null || responseData.getResponseBody() == null) {
            throw new RobotCrawlAccessException("No response body.");
        }

        final File tempFile = ResponseDataUtil.createResponseBodyFile(responseData);

        FileInputStream fis = null;

        try {
            fis = new FileInputStream(tempFile);
            final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

            for (final Map.Entry<String, Object> entry : attributeMap.entrySet()) {
                factory.setAttribute(entry.getKey(), entry.getValue());
            }

            for (final Map.Entry<String, String> entry : featureMap.entrySet()) {
                factory.setFeature(entry.getKey(), "true".equalsIgnoreCase(entry.getValue()));
            }

            factory.setCoalescing(coalescing);
            factory.setExpandEntityReferences(expandEntityRef);
            factory.setIgnoringComments(ignoringComments);
            factory.setIgnoringElementContentWhitespace(ignoringElementContentWhitespace);
            factory.setNamespaceAware(namespaceAware);
            factory.setValidating(validating);
            factory.setXIncludeAware(includeAware);

            final DocumentBuilder builder = factory.newDocumentBuilder();

            final Document doc = builder.parse(fis);

            final StringBuilder buf = new StringBuilder(1000);
            buf.append(getResultDataHeader());
            for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
                final List<String> nodeStrList = new ArrayList<String>();
                try {
                    final NodeList nodeList = getNodeList(doc, entry.getValue());
                    for (int i = 0; i < nodeList.getLength(); i++) {
                        final Node node = nodeList.item(i);
                        nodeStrList.add(node.getTextContent());
                    }
                } catch (final TransformerException e) {
                    logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
                }
                if (nodeStrList.size() == 1) {
                    buf.append(getResultDataBody(entry.getKey(), nodeStrList.get(0)));
                } else if (nodeStrList.size() > 1) {
                    buf.append(getResultDataBody(entry.getKey(), nodeStrList));
                }
            }
            buf.append(getAdditionalData(responseData, doc));
            buf.append(getResultDataFooter());

            final ResultData resultData = new ResultData();
            resultData.setTransformerName(getName());

            try {
                resultData.setData(buf.toString().getBytes(charsetName));
            } catch (final UnsupportedEncodingException e) {
                if (logger.isInfoEnabled()) {
                    logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
                }
                charsetName = Constants.UTF_8_CHARSET.name();
                resultData.setData(buf.toString().getBytes(Constants.UTF_8_CHARSET));
            }
            resultData.setEncoding(charsetName);

            return resultData;
        } catch (final RobotSystemException e) {
            throw e;
        } catch (final Exception e) {
            throw new RobotSystemException("Could not store data.", e);
        } finally {
            IOUtils.closeQuietly(fis);
            // clean up
            if (!tempFile.delete()) {
                logger.warn("Could not delete a temp file: " + tempFile);
            }
        }
    }

    protected NodeList getNodeList(final Document doc, final String xpath) throws TransformerException {
        final DefaultPrefixResolver prefixResolver = new DefaultPrefixResolver(
                doc.getNodeType() == Node.DOCUMENT_NODE ? doc.getDocumentElement() : doc);
        return getXPathAPI().eval(doc, xpath, prefixResolver).nodelist();
    }

    protected CachedXPathAPI getXPathAPI() {
        CachedXPathAPI cachedXPathAPI = xpathAPI.get();
        if (cachedXPathAPI == null) {
            cachedXPathAPI = new CachedXPathAPI();
            xpathAPI.set(cachedXPathAPI);
        }
        return cachedXPathAPI;
    }

    protected String getResultDataHeader() {
        // TODO support other type
        return "<?xml version=\"1.0\"?>\n<doc>\n";
    }

    protected String getResultDataBody(final String name, final String value) {
        // TODO support other type
        // TODO trim(default)
        return "<field name=\"" + XmlUtil.escapeXml(name) + "\">"
                + trimSpace(XmlUtil.escapeXml(value != null ? value : "")) + "</field>\n";
    }

    protected String getResultDataBody(final String name, final List<String> values) {
        final StringBuilder buf = new StringBuilder();
        buf.append("<list>");
        if (values != null && !values.isEmpty()) {
            for (final String value : values) {
                buf.append("<item>");
                buf.append(trimSpace(XmlUtil.escapeXml(value)));
                buf.append("</item>");
            }
        }
        buf.append("</list>");
        // TODO support other type
        // TODO trim(default)
        return "<field name=\"" + XmlUtil.escapeXml(name) + "\">" + buf.toString() + "</field>\n";
    }

    protected String getAdditionalData(final ResponseData responseData, final Document document) {
        return "";
    }

    protected String getResultDataFooter() {
        // TODO support other type
        return "</doc>";
    }

    protected String trimSpace(final String value) {
        if (trimSpace) {
            final Matcher matcher = SPACE_PATTERN.matcher(value);
            return matcher.replaceAll(" ").trim();
        }
        return value;
    }

    public void addAttribute(final String name, final Object value) {
        attributeMap.put(name, value);
    }

    public void addFeature(final String key, final String value) {
        featureMap.put(key, value);
    }

    public void addFieldRule(final String name, final String xpath) {
        fieldRuleMap.put(name, xpath);
    }

    /**
     * @return the fieldRuleMap
     */
    public Map<String, String> getFieldRuleMap() {
        return fieldRuleMap;
    }

    /**
     * @param fieldRuleMap the fieldRuleMap to set
     */
    public void setFieldRuleMap(final Map<String, String> fieldRuleMap) {
        this.fieldRuleMap = fieldRuleMap;
    }

    /**
     * @return the trimSpace
     */
    public boolean isTrimSpace() {
        return trimSpace;
    }

    /**
     * @param trimSpace the trimSpace to set
     */
    public void setTrimSpace(final boolean trimSpace) {
        this.trimSpace = trimSpace;
    }

    /**
     * @return the charsetName
     */
    public String getCharsetName() {
        return charsetName;
    }

    /**
     * @param charsetName the charsetName to set
     */
    public void setCharsetName(final String charsetName) {
        this.charsetName = charsetName;
    }

    /**
     * @return the dataClass
     */
    public Class<?> getDataClass() {
        return dataClass;
    }

    /**
     * @param dataClass the dataClass to set
     */
    public void setDataClass(final Class<?> dataClass) {
        this.dataClass = dataClass;
    }

    /**
     * @return the namespaceAware
     */
    public boolean isNamespaceAware() {
        return namespaceAware;
    }

    /**
     * @param namespaceAware the namespaceAware to set
     */
    public void setNamespaceAware(final boolean namespaceAware) {
        this.namespaceAware = namespaceAware;
    }

    /**
     * @return the coalescing
     */
    public boolean isCoalescing() {
        return coalescing;
    }

    /**
     * @param coalescing the coalescing to set
     */
    public void setCoalescing(final boolean coalescing) {
        this.coalescing = coalescing;
    }

    /**
     * @return the expandEntityRef
     */
    public boolean isExpandEntityRef() {
        return expandEntityRef;
    }

    /**
     * @param expandEntityRef the expandEntityRef to set
     */
    public void setExpandEntityRef(final boolean expandEntityRef) {
        this.expandEntityRef = expandEntityRef;
    }

    /**
     * @return the ignoringComments
     */
    public boolean isIgnoringComments() {
        return ignoringComments;
    }

    /**
     * @param ignoringComments the ignoringComments to set
     */
    public void setIgnoringComments(final boolean ignoringComments) {
        this.ignoringComments = ignoringComments;
    }

    /**
     * @return the ignoringElementContentWhitespace
     */
    public boolean isIgnoringElementContentWhitespace() {
        return ignoringElementContentWhitespace;
    }

    /**
     * @param ignoringElementContentWhitespace the ignoringElementContentWhitespace to set
     */
    public void setIgnoringElementContentWhitespace(final boolean ignoringElementContentWhitespace) {
        this.ignoringElementContentWhitespace = ignoringElementContentWhitespace;
    }

    /**
     * @return the validating
     */
    public boolean isValidating() {
        return validating;
    }

    /**
     * @param validating the validating to set
     */
    public void setValidating(final boolean validating) {
        this.validating = validating;
    }

    /**
     * @return the includeAware
     */
    public boolean isIncludeAware() {
        return includeAware;
    }

    /**
     * @param includeAware the includeAware to set
     */
    public void setIncludeAware(final boolean includeAware) {
        this.includeAware = includeAware;
    }

    public static class DefaultPrefixResolver extends PrefixResolverDefault {

        public DefaultPrefixResolver(final Node xpathExpressionContext) {
            super(xpathExpressionContext);
        }

        /*
         * (non-Javadoc)
         *
         * @see org.apache.xml.utils.PrefixResolverDefault#getNamespaceForPrefix(java.lang.String, org.w3c.dom.Node)
         */
        @Override
        public String getNamespaceForPrefix(final String prefix, final Node namespaceContext) {
            final String namespace = super.getNamespaceForPrefix(prefix, namespaceContext);
            if (StringUtil.isNotBlank(namespace)) {
                return namespace;
            }
            return "http://robot.codelibs.org/namespace/" + prefix;
        }

    }
}