org.xmlsh.commands.internal.xml2json.java Source code

Java tutorial

Introduction

Here is the source code for org.xmlsh.commands.internal.xml2json.java

Source

/**
 * $Id: xpwd.java 21 2008-07-04 08:33:47Z daldei $
 * $Date: 2008-07-04 04:33:47 -0400 (Fri, 04 Jul 2008) $
 *
 */

package org.xmlsh.commands.internal;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.List;

import javanet.staxutils.OutputFactory;

import javax.xml.crypto.dsig.TransformException;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import javax.xml.transform.stream.StreamSource;

import net.sf.saxon.s9api.DocumentBuilder;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.Serializer;
import net.sf.saxon.s9api.WhitespaceStrippingPolicy;
import net.sf.saxon.s9api.XdmNode;
import org.json.JSONObject;
import org.xmlsh.core.CoreException;
import org.xmlsh.core.InputPort;
import org.xmlsh.core.InvalidArgumentException;
import org.xmlsh.core.Options;
import org.xmlsh.core.OutputPort;
import org.xmlsh.core.UnexpectedException;
import org.xmlsh.core.XCommand;
import org.xmlsh.core.XValue;
import org.xmlsh.sh.shell.SerializeOpts;
import org.xmlsh.sh.shell.Shell;
import org.xmlsh.util.Util;

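/*
 * 
 * Convert an XML document written in the JXML vocabulary
 * (namespace http://www.xmlsh.org/jxml) to JSON text
 * 
 * Arguments
 * 
 * -p (print)   Pretty-print: start objects, arrays and members on a new, indented line
 * 
 * The serialization options defined by SerializeOpts are accepted as well
 */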

public class xml2json extends XCommand {

    // Set in run() from the "p" option; when true, indent() emits a newline plus padding
    private boolean bIndent = false;
    private int mLevel = 0; // parse() nesting depth; indent() writes this many spaces

    // Copy of the serialization options taken in run() BEFORE the output text encoding
    // is forced to UTF-8; used by writeFile() and readFile()
    private SerializeOpts mSerializeOpts;

    private static final String kENCODING_UTF_8 = "UTF-8";
    // Namespace URI shared by all of the kELEM_* element names below
    private static final String kJXML_URI = "http://www.xmlsh.org/jxml";

    // Attribute names (all in no namespace)
    private static final QName kATTR_ENCODING = new QName("encoding"); // text encoding for reading "src"
    private static final QName kATTR_NAME = new QName("name"); // file name on a file element
    private static final QName kATTR_VALUE = new QName("value"); // inline value of a string/number/boolean
    private static final QName kATTR_SRC = new QName("src"); // input to read a string's text from
    private static final QName kATTR_UNWRAP = new QName("unwrap"); // boolean flag: pass the string through unwrap()
    private static final QName kATTR_HTML = new QName("html"); // boolean flag: string body is XHTML, reserialized as HTML

    // Element names in the JXML namespace
    private static final QName kELEM_XJSON = new QName(kJXML_URI, "xjson"); // optional wrapper, only allowed at the root
    private static final QName kELEM_FILE = new QName(kJXML_URI, "file"); // redirects its content to a named file
    private static final QName kELEM_OBJECT = new QName(kJXML_URI, "object"); // A JSON Object
    private static final QName kELEM_MEMBER = new QName(kJXML_URI, "member"); // A JSON Object Member
    private static final QName kELEM_STRING = new QName(kJXML_URI, "string"); // A JSON String
    private static final QName kELEM_NUMBER = new QName(kJXML_URI, "number"); // A JSON Number
    private static final QName kELEM_ARRAY = new QName(kJXML_URI, "array"); // A JSON Array
    private static final QName kELEM_BOOLEAN = new QName(kJXML_URI, "boolean"); // A JSON Literal (true,false)
    private static final QName kELEM_NULL = new QName(kJXML_URI, "null"); // A JSON Literal null

    /*
     * Command entry point.
     *
     * Parses the "p" (print) option plus the serialization options, reads the
     * first remaining argument (or stdin when there is none) as XML events and
     * writes the JSON rendering to stdout. Always returns 0; failures surface
     * as exceptions.
     */
    public int run(List<XValue> args) throws Exception {
        Options options = new Options("p=print", SerializeOpts.getOptionDefs());
        options.parse(args);
        bIndent = options.hasOpt("p");

        List<XValue> remaining = options.getRemainingArgs();

        OutputPort out = getStdout();

        InputPort in;
        if (remaining.isEmpty())
            in = getStdin();
        else
            in = getInput(remaining.get(0));

        SerializeOpts outputOpts = getSerializeOpts(options);
        XMLEventReader events = in.asXMLEventReader(outputOpts);

        // Keep an untouched copy for writeFile()/readFile(), then force the
        // stdout text encoding to UTF-8
        mSerializeOpts = outputOpts.clone();
        outputOpts.setOutputTextEncoding(kENCODING_UTF_8);
        PrintWriter json = out.asPrintWriter(outputOpts);

        parse(events, json, false);
        json.flush();
        json.close();

        // Drain whatever input is left, otherwise we can get a Piped Close
        while (events.hasNext()) {
            events.nextEvent();
        }

        events.close();
        in.release();

        return 0;
    }

    /*
     * Consume events up to and including the next element and write its JSON form.
     *
     * Non-element events (text, comments, ...) are skipped. bComma requests a ","
     * before the element's output; it is only written for elements other than
     * xjson and file.
     *
     * Returns true when a start element was consumed and dispatched to a writer.
     * Returns false when an end element or the end of the stream was reached,
     * after an xjson wrapper has been fully processed, or when writeFile()
     * returned false.
     *
     * mLevel is incremented on entry and decremented on the normal exits.
     */
    private boolean parse(XMLEventReader reader, PrintWriter writer, boolean bComma) throws XMLStreamException,
            CoreException, UnsupportedEncodingException, IOException, TransformException, SaxonApiException {
        mLevel++;

        while (reader.hasNext()) {
            XMLEvent e = reader.nextEvent();
            if (e.isStartElement()) {
                StartElement start = e.asStartElement();
                QName name = start.getName();

                if (name.equals(kELEM_XJSON)) {
                    // mLevel is 1 only for the outermost parse() call
                    if (mLevel != 1)
                        throw new UnexpectedException("XJSON element must be at document root");

                    // Children become the new roots

                    // Reset the depth so each child is parsed at level 1, then
                    // keep parsing children until parse() reports false
                    mLevel = 0;
                    while (parse(reader, writer, bComma))
                        ;
                    return false;

                } else if (name.equals(kELEM_FILE)) {
                    // NOTE(review): this exit does not decrement mLevel, unlike the
                    // other return paths - confirm whether that is intended
                    if (!writeFile(start, reader, writer))
                        return false;

                }

                // This else belongs to the xjson/file chain above: the separator is
                // written only for elements that are neither xjson nor file
                else if (bComma)
                    writer.print(",");

                // Dispatch on the element name. Anything not listed - including a
                // file element for which writeFile() returned true - is skipped
                // with readToEnd()
                if (name.equals(kELEM_OBJECT))
                    writeObject(start, reader, writer);
                else if (name.equals(kELEM_ARRAY))
                    writeArray(start, reader, writer);
                else if (name.equals(kELEM_MEMBER))
                    writeMember(start, reader, writer);
                else if (name.equals(kELEM_NUMBER))
                    writeNumber(start, reader, writer);
                else if (name.equals(kELEM_BOOLEAN))
                    writeBoolean(start, reader, writer);
                else if (name.equals(kELEM_NULL))
                    writeNull(reader, writer);
                else if (name.equals(kELEM_STRING))
                    writeString(start, reader, writer);
                else
                    readToEnd(reader);

                mLevel--;
                return true;

            } else if (e.isEndElement()) {
                // End tag of the enclosing element: nothing more to parse at this level
                mLevel--;

                return false;
            }
        }
        // Ran out of events
        mLevel--;
        return false;
    }

    /*
     * Handle a file element: parse its first child element into the file named
     * by the required "name" attribute instead of into the current writer.
     *
     * The file is opened through the shell environment and written using
     * mSerializeOpts. The writer parameter is not used.
     *
     * Returns whatever the nested parse() returned (true if a child element was
     * written, false if the end of the file element was reached first).
     * Throws InvalidArgumentException when the "name" attribute is missing.
     */
    private boolean writeFile(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws UnsupportedEncodingException, IOException, XMLStreamException, CoreException, TransformException,
            SaxonApiException {
        Attribute aname = start.getAttributeByName(kATTR_NAME);
        if (aname == null)
            throw new InvalidArgumentException("Element FILE requries attribute name");
        String name = aname.getValue();

        PrintWriter w = getShell().getEnv().getOutput(getShell().getFile(name), false)
                .asPrintWriter(mSerializeOpts);

        // Close the file writer even when parsing the content fails, so the
        // output is flushed and the handle is not leaked
        try {
            return parse(reader, w, false);
        } finally {
            w.close();
        }

    }

    /*
     * Write a string element as a quoted JSON string.
     *
     * The text comes from, in order of preference:
     *   1) the "value" attribute
     *   2) the input named by the "src" attribute, decoded with the optional
     *      "encoding" attribute
     *   3) the element content (reserialized as HTML when the "html" attribute
     *      parses as true)
     *
     * When the "unwrap" attribute parses as true the text is passed through
     * unwrap() before quoting.
     */
    private void writeString(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws XMLStreamException, UnsupportedEncodingException, FileNotFoundException, IOException,
            TransformException, SaxonApiException, CoreException {
        String value = getAttr(start, kATTR_VALUE);
        String src = getAttr(start, kATTR_SRC);
        String encoding = getAttr(start, kATTR_ENCODING);
        String unwrap = getAttr(start, kATTR_UNWRAP);
        String html = getAttr(start, kATTR_HTML);
        boolean bReadToEnd = true;
        String chars;
        if (value != null)
            chars = value;
        else if (src != null)
            chars = readFile(src, encoding);
        else {
            // readString eats the close tag
            bReadToEnd = false;
            chars = readString(reader, Util.parseBoolean(html));

        }

        // If Unwrap then trim off <html> and leading and trailing blanks.
        // This must operate on the text actually being written (chars), whichever
        // of the three sources it came from - not on the raw "value" attribute.
        if (Util.parseBoolean(unwrap)) {
            chars = unwrap(chars);

        }

        writer.print(JSONObject.quote(chars));
        if (bReadToEnd)
            readToEnd(reader);

    }
    /*
     * Read the content of a string element, consuming events through its end tag.
     *
     * With bHTML the content is captured as XML and then reserialized as HTML;
     * otherwise only the character data is collected and decoded as UTF-8.
     */

    private String readString(XMLEventReader reader, boolean bHTML)
            throws TransformException, XMLStreamException, SaxonApiException, IOException {

        if (!bHTML) {
            byte[] text = serializeAsString(reader);
            return new String(text, kENCODING_UTF_8);
        }

        byte[] xhtml = serializeAsXML(reader);
        return formatAsHtml(xhtml);

    }

    /*
     * Unwrap a string by 
     * 1) Remove leading and trailing blanks
     * 2) Remove any <html> (any case) from the beginning and </html> (any case) from the end
     * 3) Remove leading and trailing blanks from the result
     *
     * Strings shorter than the tags (including the empty string) are simply
     * trimmed; null is returned unchanged.
     */
    private String unwrap(String value) {
        if (value == null)
            return null;

        final String openTag = "<html>";
        final String closeTag = "</html>";

        value = value.trim();

        // regionMatches is bounds-safe: it returns false instead of throwing
        // when the string is shorter than the tag
        if (value.regionMatches(true, 0, openTag, 0, openTag.length()))
            value = value.substring(openTag.length());

        int closeAt = value.length() - closeTag.length();
        if (closeAt >= 0 && value.regionMatches(true, closeAt, closeTag, 0, closeTag.length()))
            value = value.substring(0, closeAt);

        return value.trim();

    }

    /*
     * Read the whole content of the shell input port named by file as text.
     *
     * The bytes are decoded with the given encoding, or with the input text
     * encoding from mSerializeOpts when encoding is null. The reader and the
     * port are closed on every path, including when reading fails.
     */
    private String readFile(String file, String encoding)
            throws UnsupportedEncodingException, FileNotFoundException, IOException, CoreException {
        InputPort ip = getShell().getInputPort(file);
        try {
            Reader r = new InputStreamReader(ip.asInputStream(mSerializeOpts),
                    encoding == null ? mSerializeOpts.getInputTextEncoding() : encoding);
            try {
                // Purely local accumulator, so the unsynchronized builder is enough
                StringBuilder sb = new StringBuilder();

                char cbuf[] = new char[1000];
                int n;
                while ((n = r.read(cbuf)) > 0)
                    sb.append(cbuf, 0, n);

                return sb.toString();
            } finally {
                r.close();
            }
        } finally {
            ip.close();
        }

    }

    /*
     * Return the value of the named attribute on start, or null when the
     * element does not carry that attribute.
     */
    private String getAttr(StartElement start, QName attr) {
        Attribute found = start.getAttributeByName(attr);
        return (found != null) ? found.getValue() : null;
    }

    /*
     * Write the JSON literal null, then skip the rest of the null element.
     */
    private void writeNull(XMLEventReader reader, PrintWriter writer) throws XMLStreamException {
        final String literal = "null";
        writer.print(literal);

        readToEnd(reader);
    }

    /*
     * Write a boolean element. Handled exactly like a number: the trimmed
     * "value" attribute or character content is emitted unquoted and
     * unvalidated.
     */
    private void writeBoolean(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws XMLStreamException {

        writeNumber(start, reader, writer);
    }

    /*
     * Write a number element: the "value" attribute if present, otherwise the
     * leading character content of the element, trimmed and printed without
     * quotes. The rest of the element is then skipped.
     */
    private void writeNumber(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws XMLStreamException {

        Attribute valueAttr = start.getAttributeByName(kATTR_VALUE);
        String literal = (valueAttr == null) ? readChars(reader) : valueAttr.getValue();

        writer.print(literal.trim());
        readToEnd(reader);

    }

    /*
     * Write an object member as "name":value.
     *
     * The member name comes from the required "name" attribute; the value is
     * the first child element, written via parse(). Anything after that child
     * is skipped up to the member's end tag.
     *
     * Throws InvalidArgumentException when the "name" attribute is missing.
     */
    private void writeMember(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws XMLStreamException, UnsupportedEncodingException, CoreException, IOException, TransformException,
            SaxonApiException {

        // Validate before emitting anything so a malformed member fails with a
        // clear message instead of a NullPointerException
        Attribute aname = start.getAttributeByName(kATTR_NAME);
        if (aname == null)
            throw new InvalidArgumentException("Element MEMBER requires attribute name");

        indent(writer);
        writer.print(JSONObject.quote(aname.getValue()));
        writer.print(":");

        // parse() returning false means it already consumed the member's end tag
        if (parse(reader, writer, false))
            readToEnd(reader);

    }

    /*
     * Write an array element as [item,item,...].
     *
     * Child elements are written one after another through parse(); every
     * child after the first is asked to emit a leading comma. The loop ends
     * when parse() reports false.
     */
    private void writeArray(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws XMLStreamException, UnsupportedEncodingException, CoreException, IOException, TransformException,
            SaxonApiException {
        indent(writer);
        writer.print("[");

        boolean needComma = false;
        while (parse(reader, writer, needComma)) {
            needComma = true;
        }

        writer.print("]");

    }

    /*
     * Write an object element as {member,member,...}.
     *
     * Child elements are written through parse(); every child after the first
     * is asked to emit a leading comma. Both braces are preceded by indent().
     */
    private void writeObject(StartElement start, XMLEventReader reader, PrintWriter writer)
            throws XMLStreamException, UnsupportedEncodingException, CoreException, IOException, TransformException,
            SaxonApiException {
        indent(writer);
        writer.print("{");

        boolean needComma = false;
        while (parse(reader, writer, needComma)) {
            needComma = true;
        }

        indent(writer);
        writer.print("}");

    }

    /*
     * Parse the given bytes as an XML document (stripping all whitespace-only
     * text) and return it reserialized with the "html" output method: UTF-8,
     * no XML declaration, no indentation, trimmed.
     */

    private String formatAsHtml(byte[] xhtml) throws SaxonApiException, UnsupportedEncodingException {

        ByteArrayOutputStream htmlBytes = new ByteArrayOutputStream();

        // Configure the HTML serializer writing into the in-memory buffer
        Serializer htmlOut = Shell.getProcessor().newSerializer();
        htmlOut.setOutputProperty(Serializer.Property.METHOD, "html");
        htmlOut.setOutputProperty(Serializer.Property.ENCODING, kENCODING_UTF_8);
        htmlOut.setOutputProperty(Serializer.Property.OMIT_XML_DECLARATION, "yes");
        htmlOut.setOutputProperty(Serializer.Property.INDENT, "no");
        htmlOut.setOutputStream(htmlBytes);

        // Build the document from the captured XML and push it through the serializer
        Processor saxon = Shell.getProcessor();
        DocumentBuilder docBuilder = saxon.newDocumentBuilder();
        docBuilder.setWhitespaceStrippingPolicy(WhitespaceStrippingPolicy.ALL);
        StreamSource xmlSource = new StreamSource(new ByteArrayInputStream(xhtml));
        XdmNode document = docBuilder.build(xmlSource);
        saxon.writeXdmValue(document, htmlOut);

        String html = htmlBytes.toString(kENCODING_UTF_8);
        return html.trim();

    }

    /*
     * Copy events from the reader into an in-memory UTF-8 XML serialization,
     * stopping at (and consuming, but not copying) the first end tag of a
     * string element. Returns the serialized bytes.
     */
    private byte[] serializeAsXML(XMLEventReader reader) throws XMLStreamException {

        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        XMLOutputFactory factory = new OutputFactory();
        XMLEventWriter xmlOut = factory.createXMLEventWriter(buffer, kENCODING_UTF_8);

        boolean atStringEnd = false;
        while (!atStringEnd && reader.hasNext()) {
            XMLEvent next = reader.nextEvent();
            atStringEnd = next.isEndElement() && next.asEndElement().getName().equals(kELEM_STRING);
            if (!atStringEnd)
                xmlOut.add(next);
        }

        xmlOut.flush();
        xmlOut.close();
        return buffer.toByteArray();

    }

    /*
     * Collect the character data of all events up to the first end tag of a
     * string element (which is consumed) and return it as UTF-8 bytes.
     * Events that are not character data are dropped.
     */
    private byte[] serializeAsString(XMLEventReader reader)
            throws XMLStreamException, UnsupportedEncodingException, IOException {

        ByteArrayOutputStream collected = new ByteArrayOutputStream();

        boolean atStringEnd = false;
        while (!atStringEnd && reader.hasNext()) {
            XMLEvent next = reader.nextEvent();

            if (next.isEndElement() && next.asEndElement().getName().equals(kELEM_STRING)) {
                atStringEnd = true;
            } else if (next.isCharacters()) {
                String text = next.asCharacters().getData();
                collected.write(text.getBytes(kENCODING_UTF_8));
            }
        }

        return collected.toByteArray();

    }

    /*
     * When pretty-printing is enabled, start a new line and pad it with one
     * space per current nesting level. Does nothing otherwise.
     */
    private void indent(PrintWriter writer) {
        if (!bIndent)
            return;

        writer.println();
        int remaining = mLevel;
        while (remaining-- > 0)
            writer.print(' ');

    }

    /*
     * Skip the remainder of the element the reader is currently inside,
     * consuming events up to and including that element's end tag (or to the
     * end of the stream).
     *
     * Nested child elements are skipped as a whole: their end tags are matched
     * against their start tags so that only the end tag of the enclosing
     * element terminates the skip.
     */
    private void readToEnd(XMLEventReader reader) throws XMLStreamException {
        int depth = 0; // number of nested start tags seen and not yet closed
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (event.isStartElement()) {
                depth++;
            } else if (event.isEndElement()) {
                if (depth == 0)
                    return; // this is the enclosing element's own end tag
                depth--;
            }
        }

    }

    /*
     * Concatenate the data of the consecutive character events at the current
     * position. Stops, without consuming it, at the first event that is not
     * character data.
     */
    private String readChars(XMLEventReader reader) throws XMLStreamException {

        StringBuilder text = new StringBuilder();
        while (reader.hasNext()) {
            if (!reader.peek().isCharacters())
                break;
            text.append(reader.nextEvent().asCharacters().getData());
        }
        return text.toString();

    }

}

//
//
//Copyright (C) 2008-2014    David A. Lee.
//
//The contents of this file are subject to the "Simplified BSD License" (the "License");
//you may not use this file except in compliance with the License. You may obtain a copy of the
//License at http://www.opensource.org/licenses/bsd-license.php 
//
//Software distributed under the License is distributed on an "AS IS" basis,
//WITHOUT WARRANTY OF ANY KIND, either express or implied.
//See the License for the specific language governing rights and limitations under the License.
//
//The Original Code is: all this file.
//
//The Initial Developer of the Original Code is David A. Lee
//
//Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
//
//Contributor(s): none.
//