com.cloudera.recordbreaker.analyzer.XMLSchemaDescriptor.java Source code

Introduction

Here is the source code for com.cloudera.recordbreaker.analyzer.XMLSchemaDescriptor.java
Source

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.IOException;
import java.util.Map;
import java.util.SortedMap;
import java.util.Set;
import java.util.List;
import java.util.HashSet;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Iterator;
import java.util.ArrayList;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.json.JSONArray;
import org.json.JSONObject;
import org.json.JSONException;

/*****************************************************************
 * <code>XMLSchemaDescriptor</code> builds an Avro-style schema out of the XML info.
 *
 * @author "Michael Cafarella"
 * @version 1.0
 * @since 1.0
 * @see SchemaDescriptor
 ******************************************************************/
public class XMLSchemaDescriptor extends GenericSchemaDescriptor {
    public static String SCHEMA_ID = "xml";
    TagEnvironment rootTag;

    /**
     * Creates a new <code>XMLSchemaDescriptor</code> instance for the given data.
     * This constructor only delegates to the superclass constructor.
     * NOTE(review): presumably the superclass constructor is what triggers
     * computeSchema() and thereby the XML parse -- confirm against
     * GenericSchemaDescriptor.
     *
     * @param dd descriptor of the data to be described
     * @throws IOException propagated from the superclass constructor
     */
    public XMLSchemaDescriptor(DataDescriptor dd) throws IOException {
        super(dd);
    }

    /**
     * Recreates a descriptor from previously stored state instead of re-parsing
     * the XML.
     *
     * @param dd descriptor of the underlying data
     * @param schemaRepr stored schema representation, handed to the superclass
     * @param miscPayload JSON text of the tag tree, in the format written by
     *        getPayload(); decoded with the platform default charset, which is
     *        also what getPayload() encodes with
     * @throws IOException if the payload is absent or is not a valid tag-tree
     *         JSON document; a JSON failure is attached as the cause
     */
    public XMLSchemaDescriptor(DataDescriptor dd, String schemaRepr, byte[] miscPayload) throws IOException {
        super(dd, schemaRepr);

        // Fail with the declared exception type rather than a bare NullPointerException
        if (miscPayload == null) {
            throw new IOException("Missing payload: cannot rebuild the XML tag tree");
        }

        // Deserialize the root tag from the payload info
        try {
            this.rootTag = new TagEnvironment(new JSONObject(new String(miscPayload)));
            this.rootTag.setParent(null);
        } catch (JSONException jne) {
            // Keep the original exception as the cause so the stack trace is not lost
            throw new IOException("JSONException: " + jne.toString(), jne);
        }
    }

    /**
     * Serializes the tag tree rooted at <code>rootTag</code> to JSON text and
     * returns that text as bytes in the platform default charset.
     */
    public byte[] getPayload() {
        return rootTag.serialize().toString().getBytes();
    }

    /**
     * Parses the raw XML with a non-validating SAX parser, stores the resulting
     * tag tree in <code>rootTag</code> and asks the tree to complete itself
     * (repetition detection, flattening, schema unification).
     *
     * @throws IOException if the parser cannot be configured or the input is not
     *         parseable XML; the parser's exception is attached as the cause
     */
    void computeSchema() throws IOException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        // Unfortunately, validation is often not possible
        factory.setValidating(false);

        try {
            // The input is an arbitrary data file, so never let it pull in external
            // entities or an external DTD subset (XXE; also avoids failing on
            // DTD references that cannot be resolved).
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

            // The XMLProcessor builds up a tree of tags
            XMLProcessor xp = new XMLProcessor();
            SAXParser parser = factory.newSAXParser();
            parser.parse(dd.getRawBytes(), xp);

            // Grab the root tag
            this.rootTag = xp.getRoot();

            // Once the tree is built, we:
            // a) Find the correct repetition node (and throw out 'bad' repeats)
            // b) Flatten hierarchies of subfields into a single layer, so it's suitable
            //    for relational-style handling
            // c) Build an overall schema object that can summarize every expected
            //    object, even if the objects' individual schemas differ somewhat
            this.rootTag.completeTree();
        } catch (SAXException saxe) {
            // Preserve the cause so the parse location is not lost
            throw new IOException(saxe.toString(), saxe);
        } catch (ParserConfigurationException pcee) {
            throw new IOException(pcee.toString(), pcee);
        }
    }

    ////////////////////////////////////////////////////////////////////
    // XMLProcessor creates a tree of TagEnvironment objects out of
    // an input XML file.
    // 
    // The result of a parse is a single TagEnvironment root object
    // (which points to the rest of the data).
    ////////////////////////////////////////////////////////////////////  
    class XMLProcessor extends DefaultHandler {
        // Stack of currently open elements; index 0 is always the synthetic root.
        List<TagEnvironment> environments = new ArrayList<TagEnvironment>();
        // Stack, parallel to the open elements, of the text gathered for each one.
        List<StringBuffer> tagData = new ArrayList<StringBuffer>();

        /** Starts the element stack with a synthetic, parentless "<root>" node. */
        public XMLProcessor() {
            environments.add(new TagEnvironment(null, "<root>"));
        }

        /**
         * Opens a new node under the innermost open element.  Dashes in the
         * tag name are replaced by underscores.  Attributes are not recorded.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs) {
            String tagName = qName.replace("-", "_");
            TagEnvironment enclosing = environments.get(environments.size() - 1);
            TagEnvironment opened = new TagEnvironment(enclosing, tagName);
            enclosing.addChild(opened);
            environments.add(opened);
            tagData.add(new StringBuffer());
        }

        /** Appends character data to the text buffer of the innermost open element. */
        @Override
        public void characters(char[] ch, int start, int length) {
            StringBuffer innermost = tagData.get(tagData.size() - 1);
            innermost.append(ch, start, length);
        }

        /** Closes the innermost open element and hands it the text gathered for it. */
        @Override
        public void endElement(String uri, String localName, String qName) {
            StringBuffer gathered = tagData.remove(tagData.size() - 1);
            String finalTagData = gathered.toString();
            TagEnvironment closed = environments.remove(environments.size() - 1);
            closed.setData(finalTagData);
        }

        /** Returns the synthetic root node, i.e. the bottom of the element stack. */
        public TagEnvironment getRoot() {
            return environments.get(0);
        }
    }

    ////////////////////////////////////////////////////////////////////
    // TagEnvironment captures a tag and its associated data.
    //
    // There is only one 'repetitionNode' per tree.  This is the node whose
    // children will eventually become records when returned by getIterator().
    // We select the node that has more than one identically-named child AND
    // is closest to the root.
    //
    // Other nodes may also have repeated children while being further
    // from the root than the repetitionNode.  Maybe someday we will compute
    // a kind of cartesian product among multiple repetitionNodes, but for the moment
    // we simply throw out these extra repeated fields.  We ignore the fields
    // starting with the 2nd appearance.
    //
    ////////////////////////////////////////////////////////////////////
    class TagEnvironment {
        // Several members are appropriate for every node in the tree.
        String label;
        String data;
        boolean repetitionNode;
        TagEnvironment parent;
        List<TagEnvironment> children = new ArrayList<TagEnvironment>();

        // These fields are only filled in for TagEnvironment nodes that
        // represent genuine tuples.  They are inappropriate for most
        // nodes in the tree.
        List<Object> typedFields = new ArrayList<Object>();
        List<Schema.Field> fieldSchemas = new ArrayList<Schema.Field>();

        /**
         * Rebuilds a node, and recursively its whole subtree, from the JSON
         * form written by serialize().  The parent link of this node is left
         * unset; the parent link of every rebuilt child is set to this node.
         *
         * @param jsonPayload object holding the keys "label", "data",
         *        "isRepetitionNode", "fieldSchemas" and "children"
         * @throws JSONException propagated from the org.json accessors
         */
        public TagEnvironment(JSONObject jsonPayload) throws JSONException {
            // Scalar members
            this.label = (String) jsonPayload.get("label");
            this.data = (String) jsonPayload.get("data");
            this.repetitionNode = jsonPayload.getBoolean("isRepetitionNode");

            // Hoisted fields: each entry carries a name, an Avro schema and a value
            JSONArray storedFields = jsonPayload.getJSONArray("fieldSchemas");
            this.fieldSchemas = new ArrayList<Schema.Field>();
            this.typedFields = new ArrayList<Object>();
            for (int idx = 0; idx < storedFields.length(); idx++) {
                JSONObject stored = storedFields.getJSONObject(idx);
                String fieldName = (String) stored.get("name");
                Schema fieldSchema = Schema.parse((String) stored.get("schema"));
                this.fieldSchemas.add(new Schema.Field(fieldName, fieldSchema, "", null));

                // The stored schema type decides how the value is read back
                Object value;
                switch (fieldSchema.getType()) {
                case INT:
                    value = Integer.valueOf(stored.getInt("value"));
                    break;
                case DOUBLE:
                    value = Double.valueOf(stored.getDouble("value"));
                    break;
                case LONG:
                    value = Long.valueOf(stored.getLong("value"));
                    break;
                default:
                    value = (String) stored.get("value");
                    break;
                }
                this.typedFields.add(value);
            }

            // Subtree
            JSONArray storedChildren = jsonPayload.getJSONArray("children");
            this.children = new ArrayList<TagEnvironment>();
            for (int idx = 0; idx < storedChildren.length(); idx++) {
                TagEnvironment kid = new TagEnvironment(storedChildren.getJSONObject(idx));
                kid.setParent(this);
                this.children.add(kid);
            }
        }

        /**
         * Simply create a node in the tag tree.  The new node is not a
         * repetition node and is NOT added to the parent's child list; the
         * caller does that via addChild().
         *
         * @param parent enclosing node, or null for the root
         * @param label tag name of this node
         */
        public TagEnvironment(TagEnvironment parent, String label) {
            this.parent = parent;
            this.label = label;
            this.repetitionNode = false;
        }

        //////////////////////////////////////////////
        // Called by XMLProcessor during XML parsing
        //////////////////////////////////////////////
        /** Appends the given node to this node's child list (document order). */
        public void addChild(TagEnvironment child) {
            this.children.add(child);
        }

        /** Stores the raw text content of this tag. */
        public void setData(String data) {
            this.data = data;
        }

        ////////////////////////////////////////////////
        // Serialize the tag (and its descendents) into a JSON object
        ////////////////////////////////////////////////
        /**
         * Converts this node and all of its descendants into a JSON object with
         * the keys "label", "data", "isRepetitionNode", "fieldSchemas" and
         * "children".  Missing data is written as the empty string.
         *
         * @return the JSON form, or null if a JSONException occurred (the stack
         *         trace is printed).  A child that fails is stored as a null
         *         entry in the parent's "children" array.
         *         NOTE(review): org.json rejects non-finite numbers, so a DOUBLE
         *         field holding NaN or Infinity presumably ends up on this
         *         path -- confirm.
         */
        public JSONObject serialize() {
            try {
                JSONObject jobj = new JSONObject();

                // Scalar members
                String storedData = data;
                if (storedData == null) {
                    storedData = "";
                }
                jobj.put("label", label);
                jobj.put("data", storedData);
                jobj.put("isRepetitionNode", repetitionNode);

                // Hoisted fields, each written as {name, schema, value}
                JSONArray jsonFieldSchemas = new JSONArray();
                int pos = 0;
                for (Schema.Field sf : fieldSchemas) {
                    Object rawValue = typedFields.get(pos++);
                    JSONObject entry = new JSONObject();
                    entry.put("name", sf.name());
                    entry.put("schema", sf.schema().toString());

                    switch (sf.schema().getType()) {
                    case INT:
                        entry.put("value", ((Integer) rawValue).intValue());
                        break;
                    case DOUBLE:
                        entry.put("value", ((Double) rawValue).doubleValue());
                        break;
                    case LONG:
                        entry.put("value", ((Long) rawValue).longValue());
                        break;
                    default:
                        entry.put("value", (String) rawValue);
                        break;
                    }
                    jsonFieldSchemas.put(entry);
                }
                jobj.put("fieldSchemas", jsonFieldSchemas);

                // Subtree, in child order
                JSONArray jsonChildren = new JSONArray();
                jobj.put("children", jsonChildren);
                for (TagEnvironment kid : children) {
                    jsonChildren.put(kid.serialize());
                }
                return jobj;
            } catch (JSONException jne) {
                jne.printStackTrace();
                return null;
            }
        }

        /** Sets the enclosing node; null marks this node as having no parent. */
        public void setParent(TagEnvironment parent) {
            this.parent = parent;
        }

        ////////////////////////////////////////////////
        // Called after the initial XML parse is done.
        // We now figure out various whole-tree properties.
        ////////////////////////////////////////////////
        /**
         * completeTree() figures out the correct level of tuple-repetition.
         * Then it assigns data fields to the correct repeated-node(s).
         * Finally, it computes a single schema that captures all of the fields
         * found across the repeated-node set and stores it in the enclosing
         * descriptor's <code>schema</code>.
         *
         * If no node in the tree has repeated children, this node is itself
         * marked as the repetition node, so that each of its children is
         * treated as one tuple and a document without repetition yields its
         * content once instead of failing.
         */
        public void completeTree() {
            this.completeTree(false);
            if (!this.hasRepetitionNode()) {
                // Nothing repeats anywhere: fall back to one tuple per child of this node
                this.repetitionNode = true;
            }
            this.hoistData();
            schema = this.getUnifiedSchema(true);
        }

        /** Reports whether this node or any of its descendants is a repetition node. */
        private boolean hasRepetitionNode() {
            if (repetitionNode) {
                return true;
            }
            for (TagEnvironment child : children) {
                if (child.hasRepetitionNode()) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Recursive worker for completeTree().  Marks this node as a repetition
         * node if some child label occurs more than once and no ancestor on the
         * path from the root has been marked yet, prunes the child list
         * accordingly, then recurses into the remaining children.
         *
         * @param repetitionNodeFound true if an ancestor of this node is
         *        already a repetition node
         */
        private void completeTree(boolean repetitionNodeFound) {
            boolean foundAtOrAbove = repetitionNodeFound;

            // Count how often each child label occurs.  The sorted map makes the
            // tie-break below deterministic (alphabetically first label wins).
            Map<String, Integer> labelCounts = new TreeMap<String, Integer>();
            for (TagEnvironment kid : children) {
                String kidLabel = kid.getLabel();
                Integer seenSoFar = labelCounts.get(kidLabel);
                labelCounts.put(kidLabel, (seenSoFar == null) ? 1 : seenSoFar.intValue() + 1);
            }

            // More children than distinct labels means at least one label repeats,
            // so this node could be the iterator node.
            boolean someLabelRepeats = children.size() > labelCounts.size();
            if (someLabelRepeats) {
                if (!foundAtOrAbove) {
                    this.repetitionNode = true;
                    foundAtOrAbove = true;
                }

                if (this.repetitionNode) {
                    // Iterator node: keep only the children carrying the most frequent
                    // label.  NOTE: this discards one-off metadata siblings that we
                    // might like to retain in the future.
                    if (labelCounts.size() > 1) {
                        String dominantLabel = null;
                        int dominantCount = -1;
                        for (Map.Entry<String, Integer> tally : labelCounts.entrySet()) {
                            if (tally.getValue() > dominantCount) {
                                dominantCount = tally.getValue();
                                dominantLabel = tally.getKey();
                            }
                        }

                        Iterator<TagEnvironment> pruner = children.iterator();
                        while (pruner.hasNext()) {
                            if (!pruner.next().label.equals(dominantLabel)) {
                                pruner.remove();
                            }
                        }
                    }
                } else {
                    // Not the iterator node: keep the first child of every label and
                    // drop later duplicates.  NOTE: eventually the extra values could
                    // expand one tuple into several rows, but that is too complicated
                    // for the moment.
                    Set<String> alreadyKept = new HashSet<String>();
                    Iterator<TagEnvironment> pruner = children.iterator();
                    while (pruner.hasNext()) {
                        // add() returns false for a label that was seen before
                        if (!alreadyKept.add(pruner.next().label)) {
                            pruner.remove();
                        }
                    }
                }
            }

            // Continue with whatever children survived the pruning.
            for (TagEnvironment kid : children) {
                kid.completeTree(foundAtOrAbove);
            }
        }

        /////////////////////////////////////////////////////////////////
        // hoistData() and several helper functions flatten subfield trees
        // and assigns the resulting flattened fields to the repeated-node.
        /////////////////////////////////////////////////////////////////
        /**
         * Walks the subtree and, for every leaf, records the leaf's typed value
         * as a field on its hoist target (the record-level ancestor).  The field
         * name is the underscore-joined path from the target down to the leaf,
         * or the leaf's own label when the leaf is itself the target.
         *
         * A leaf that has no repetition node above it has no record to belong
         * to and is skipped.
         */
        void hoistData() {
            if (children.size() == 0) {
                TagEnvironment hoistTarget = getHoistTarget();
                if (hoistTarget == null) {
                    // Leaf outside every repetition node: nothing to attach it to
                    return;
                }
                String fieldLabel = buildHoistedLabel(hoistTarget);
                if (fieldLabel.length() == 0) {
                    fieldLabel = getLabel();
                }
                hoistTarget.addField(getTypedData(),
                        new Schema.Field(fieldLabel, Schema.create(getDataSchema()), "", null));
            } else {
                for (TagEnvironment child : children) {
                    child.hoistData();
                }
            }
        }

        /**
         * Appends a value and its field description; the two lists are kept
         * index-aligned.
         */
        void addField(Object val, Schema.Field sfield) {
            this.typedFields.add(val);
            this.fieldSchemas.add(sfield);
        }

        /**
         * Finds the node that represents the record this node belongs to: the
         * nearest node, starting at this one and moving towards the root, whose
         * parent is a repetition node.
         *
         * @return that node, or null if the root is reached without meeting a
         *         repetition node
         */
        TagEnvironment getHoistTarget() {
            // Check for the root BEFORE touching parent's members
            if (parent == null) {
                return null;
            }
            if (parent.repetitionNode) {
                return this;
            }
            return parent.getHoistTarget();
        }

        /**
         * Builds the underscore-joined label path from <code>target</code>
         * (exclusive) down to this node (inclusive).
         *
         * @param target this node or one of its ancestors
         * @return "" if this node is the target, otherwise the joined path
         */
        String buildHoistedLabel(TagEnvironment target) {
            if (this == target) {
                return "";
            }
            String prefix = parent.buildHoistedLabel(target);
            if (prefix.length() > 0) {
                return prefix + "_" + label;
            }
            return label;
        }

        /**
         * Computes a record schema covering the fields found at or below this
         * node.  A node that carries hoisted fields answers with exactly those
         * fields.  Any other node merges the schemas of its children, keeping
         * the first field seen for each name, so that fields present in only
         * some records are still part of the result.
         *
         * @param initial true for the outermost call, which produces a named
         *        record; nested calls produce anonymous records
         * @return the schema, or null if no fields exist at or below this node
         */
        Schema getUnifiedSchema(boolean initial) {
            // Case 1: this node holds fields itself.
            if (!typedFields.isEmpty()) {
                if (!initial) {
                    return Schema.createRecord(fieldSchemas);
                }
                Schema own = Schema.createRecord("name", "", "XMLExtractedSchema", false);
                own.setFields(fieldSchemas);
                return own;
            }

            // Case 2: gather the children's fields, first occurrence of a name wins.
            Map<String, Schema.Field> firstSeen = new TreeMap<String, Schema.Field>();
            for (TagEnvironment kid : children) {
                Schema kidSchema = kid.getUnifiedSchema(false);
                if (kidSchema == null) {
                    continue;
                }
                for (Schema.Field candidate : kidSchema.getFields()) {
                    if (!firstSeen.containsKey(candidate.name())) {
                        firstSeen.put(candidate.name(), candidate);
                    }
                }
            }
            if (firstSeen.isEmpty()) {
                return null;
            }

            // Fresh Field objects are created for the merged record, in name order.
            List<Schema.Field> merged = new ArrayList<Schema.Field>();
            for (Schema.Field seen : firstSeen.values()) {
                merged.add(new Schema.Field(seen.name(), Schema.create(seen.schema().getType()), "", null));
            }
            if (!initial) {
                return Schema.createRecord(merged);
            }
            Schema unified = Schema.createRecord("XMLSchema", "", "XMLExtractedSchema", false);
            unified.setFields(merged);
            return unified;
        }

        ////////////////////////////////////////////////////////
        // Accessors
        ////////////////////////////////////////////////////////
        /** Returns the tag name of this node. */
        public String getLabel() {
            return label;
        }

        /**
         * Collects one record per child of every repetition node reachable from
         * here without passing through another repetition node.  Records are
         * built against the enclosing descriptor's whole-tree schema.
         *
         * @param accumulator list the records are appended to
         */
        public void accumulateObjects(List<Object> accumulator) {
            if (this.repetitionNode) {
                // Each child of the repetition node is one tuple
                for (TagEnvironment tuple : children) {
                    accumulator.add(tuple.buildRecord(schema));
                }
                return;
            }
            // Not there yet: keep descending
            for (TagEnvironment kid : children) {
                kid.accumulateObjects(accumulator);
            }
        }

        /**
         * Helper function for accumulateObjects(): creates a record of the
         * given schema and stores every field hoisted onto this node under its
         * field name.
         */
        Object buildRecord(Schema s) {
            GenericData.Record rec = new GenericData.Record(s);
            int pos = 0;
            for (Object value : typedFields) {
                // typedFields and fieldSchemas are index-aligned
                String fieldName = fieldSchemas.get(pos).name();
                rec.put(fieldName, value);
                pos++;
            }
            return rec;
        }

        /**
         * Return the typed version of any data stored at this tag node.
         * The narrowest fitting type wins: Integer, then Long, then Double,
         * and finally the raw String.
         *
         * Long must be tried before Double: every string that parses as a long
         * also parses as a double, so the opposite order can never yield a Long
         * and silently rounds large integers.
         *
         * @return an Integer, Long, Double or String, or null if there is no data
         */
        Object getTypedData() {
            if (data == null) {
                return null;
            }
            try {
                return Integer.parseInt(data);
            } catch (NumberFormatException ignored) {
                // not an int; try the next wider type
            }
            try {
                return Long.parseLong(data);
            } catch (NumberFormatException ignored) {
                // not a long; try floating point
            }
            try {
                return Double.parseDouble(data);
            } catch (NumberFormatException ignored) {
                // not numeric at all
            }
            return data;
        }

        /**
         * Return a schema type that describes the data stored at this tag node.
         * It is derived from getTypedData() so that the declared type and the
         * actual value can never disagree.
         *
         * @return INT, LONG, DOUBLE or STRING, or null if there is no data
         */
        Schema.Type getDataSchema() {
            Object typed = getTypedData();
            if (typed == null) {
                return null;
            }
            if (typed instanceof Integer) {
                return Schema.Type.INT;
            }
            if (typed instanceof Long) {
                return Schema.Type.LONG;
            }
            if (typed instanceof Double) {
                return Schema.Type.DOUBLE;
            }
            return Schema.Type.STRING;
        }

        /** Short debugging description: the tag label and the number of direct children. */
        public String toString() {
            return "Tag " + label + " (" + children.size() + " children)";
        }
    }

    /**
     * Return an object that steps through all the data items in the file.
     * It's a bit unclear how this should work, as an XML file is really a tree,
     * but we usually assume an Iterator is giving back tuples.  So what's a "row"
     * in the case of XML?
     *
     * Ideally, the iterator gives back a result for every "repeated leaf".  Just
     * being a leaf in the XML tree is not enough: there should be a level of
     * repetition that makes it interesting.  If there's no repetition then
     * the object is just returned once.
     *
     * We can detect when repeats happen: it yields an Avro array.  But what about
     * when we have multiple levels of this?  Probably we want a row for the
     * product of repetitions.
     *
     * Definitely do not want a record for each individual leaf-level field.
     *
     * Maybe we want to fill out as many non-exclusive columns as we can?
     * As soon as a repetition means we have a column-repetition-conflict, then
     * time for a new row.
     *
     * How do we translate an Avro tree into a series of tuples?  Basically
     * we have one new Record for each leaf-level array entry.  There is never
     * an array in an object returned by getIterator().  If it's just a nested
     * set of Records, then it's one big tuple.
     *
     * Implementation: all records are collected from the tag tree when the
     * iterator is created and then handed out one by one.  Once the records
     * are exhausted, next() returns null rather than throwing; remove() is
     * not supported.
     */
    public Iterator getIterator() {
        return new Iterator() {
            // NOTE: We should eventually modify this so we don't have to
            // materialize the entire tree.  This is pretty inefficient right now.
            final List<Object> lookaheadList = new ArrayList<Object>();
            // Index of the next record to hand out.  Advancing an index instead of
            // calling remove(0) keeps each step O(1).
            int cursor = 0;
            Object nextElt = null;
            {
                rootTag.accumulateObjects(lookaheadList);
                nextElt = lookahead();
            }

            public boolean hasNext() {
                return nextElt != null;
            }

            public synchronized Object next() {
                Object toReturn = nextElt;
                nextElt = lookahead();
                return toReturn;
            }

            public void remove() {
                throw new UnsupportedOperationException();
            }

            Object lookahead() {
                if (cursor < lookaheadList.size()) {
                    // Clear the slot so a consumed record can be garbage collected
                    Object elt = lookaheadList.set(cursor, null);
                    cursor++;
                    return elt;
                } else {
                    return null;
                }
            }
        };
    }

    /**
     * Identifies the kind of source this schema was derived from: an XML file.
     *
     * @return the constant {@link #SCHEMA_ID}
     */
    public String getSchemaSourceDescription() {
        return SCHEMA_ID;
    }
}