tufts.vue.ds.XMLIngest.java Source code

Java tutorial

Introduction

Here is the source code for tufts.vue.ds.XMLIngest.java

Source

/*
* Copyright 2003-2010 Tufts University  Licensed under the
 * Educational Community License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 * 
 * http://www.osedu.org/licenses/ECL-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS IS"
 * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package tufts.vue.ds;

import tufts.Util;
import tufts.vue.DEBUG;
import tufts.vue.MetaMap;
import tufts.vue.MetaMap.*;

import java.util.*;
import java.io.*;
import java.net.*;

import javax.xml.xpath.*;
import javax.xml.parsers.*;

import org.w3c.dom.Node;
import org.w3c.dom.*;
import org.xml.sax.*;

// TODO: just forget handling depth (e.g., jira comments) for now -- can tackle later.
// Keep this in mind w/respect to how we handle data-set iteration, so we could
// add this under the hood if we like later.

// As for our data-model, we could literally use the XML DOM, tho that's got
// way more than we need in it and isn't very convenient.  I guess we
// just need our own API that nicely abstracts everything, so under the
// hood we could use anything from Jackrabbit to DOM to Multimaps to SQL
// or whatever.

// NEED TO GENERICALLY HANDLE KEY MANAGEMENT, AND DATA-CHANGE DETECTION.

// Big question: do we persist original raw XML streams, or digest
// the data first then persist it?  First case is safer for
// ultimate data integrity -- can fix parsing / data coalescing
// bugs or make enhancements more easily.  We could persist the
// mashed data, but then we'd just need another format / persist
// schema anyway.

/**
 * @version $Revision: 1.16 $ / $Date: 2010-02-03 19:13:16 $ / $Author: mike $
 * @author Scott Fraize
 */

public class XMLIngest {

    private static final org.apache.log4j.Logger Log = org.apache.log4j.Logger.getLogger(XMLIngest.class);

    private static final boolean XML_DEBUG = false;
    private static final boolean XML_OUTPUT = false;

    // REPLACE WITH A WRAPPER: XMLIngestor / XML-SCHEMA-LOADER (or just split out as XMLIngest methods)
    // ALL WE NEED are the track methods, and a public final schema for dumpSchema debug & isXMLKeyFold(),
    // which we can probably pull out of generic Schema

    /**
     * Schema that is filled in while an XML document is being walked.  Field names are
     * the dotted element paths reported by the walk, with the configured item path (and
     * its trailing dot) stripped off the front.  Every element whose full path equals
     * the item path opens a new DataRow; values seen while a row is open are added to
     * that row, all other values are only tracked on their field.
     */
    public static class XmlSchema extends tufts.vue.ds.Schema {
        /** Full dotted path of the element that starts a row; may be null. */
        final String itemPath;
        /** Count of leading characters removed from field names: item path length plus one for the dot, or 0. */
        final int itemPathLen;

        /** Row currently being populated, or null when the walk is not inside a row element. */
        DataRow curRow;

        /** castor persistence only */
        // todo: see if this class can go away entirely and Schema become a final class,
        // which would avoid the persistence complications with castor.
        public XmlSchema() {
            itemPath = "<unknown>";
            itemPathLen = 0;
        }

        /**
         * @param source   resource the data comes from; handed to setResource
         * @param itemPath dotted path of the row element; a path starting with "plist."
         *                 switches on XML key folding
         */
        public XmlSchema(tufts.vue.Resource source, String itemPath) {
            super.setResource(source);
            this.itemPath = itemPath;

            final boolean hasPath = (itemPath != null && itemPath.length() != 0);
            itemPathLen = hasPath ? itemPath.length() + 1 : 0; // the extra one covers the dot

            setXMLKeyFold(itemPath != null && itemPath.startsWith("plist."));

            Log.debug("Constructed XmlSchema " + this);
        }

        /** Prints the item path (when there is one) ahead of the inherited schema dump. */
        @Override
        public void dumpSchema(PrintWriter ps) {
            if (itemPath != null) {
                ps.println("ItemPath: " + itemPath);
            }
            super.dumpSchema(ps);
        }

        /**
         * Records one name/value pair.  The field is created on first sight; the value
         * goes to the open row if there is one, otherwise it is only tracked on the field.
         */
        void trackFieldValuePair(String name, String value) {

            // Names below the item path are shortened to be relative to it.
            final boolean belowItemPath = itemPath != null
                    && name.length() > itemPathLen
                    && name.startsWith(itemPath);
            final String fieldName = belowItemPath ? name.substring(itemPathLen) : name;

            Field field = getField(fieldName);
            if (field == null) {
                field = addField(fieldName);
                final int nameLen = fieldName.length();
                if (nameLen > mLongestFieldName) {
                    mLongestFieldName = nameLen;
                }
            }

            if (curRow == null) {
                field.trackValue(value);
            } else {
                curRow.addValue(field, value);
            }
        }

        /** Opens and registers a fresh row when the named element is the row-start element. */
        void trackNodeOpen(String name) {
            if (!name.equals(getRowStartNode())) {
                return;
            }
            final DataRow opened = new DataRow(this);
            curRow = opened;
            addRow(opened);
        }

        /** Closes the current row when the named element is the row-start element. */
        void trackNodeClose(String name) {
            if (!name.equals(getRowStartNode())) {
                return;
            }
            curRow = null;
        }

        /** @return the full path of the element that delimits rows (the item path) */
        private String getRowStartNode() {
            return itemPath;
        }

    }

    //     static class RssSchema extends XmlSchema {

    //         @Override
    //         final String getRowStartNode() {
    //             //return "item";
    //             return "rss.channel.item";
    //         }

    //         @Override
    //         final String getKeyNode() {
    //             //return "rss.channel.item.key";
    //             //return "item.key";
    //             return "key";
    //         }

    //     }

    static int depth = 0;

    /**
     * Experimental alternative to walking the whole document: evaluates the hard-coded
     * expression "/rss/channel/item" against the document and passes every matching
     * node to scanNode.  The single-node and string evaluations only produce diagnostic
     * output.  Any failure is printed to System.err and then ignored.
     *
     * @param schema   schema that receives the scanned nodes
     * @param document parsed DOM document to query
     */
    static void XPathExtract(XmlSchema schema, Document document) {

        final String itemsExpr = "/rss/channel/item";

        try {

            final XPath evaluator = XPathFactory.newInstance().newXPath();

            errout("Extracting " + itemsExpr);

            // Diagnostic: the first match as a node...
            final Node firstMatch = (Node) evaluator.evaluate(itemsExpr, document, XPathConstants.NODE);
            errout("   Node: " + firstMatch);

            // ...and the same expression as a string.
            final String asString = (String) evaluator.evaluate(itemsExpr, document, XPathConstants.STRING);
            System.out.println(" String: " + asString);

            // The real work: scan every matching node.
            final NodeList matches = (NodeList) evaluator.evaluate(itemsExpr, document, XPathConstants.NODESET);
            errout("NodeSet: " + Util.tag(matches) + "; size=" + matches.getLength());

            int index = 0;
            while (index < matches.getLength()) {
                scanNode(schema, matches.item(index), null, null);
                index++;
            }

        } catch (XPathExpressionException e) {
            System.err.println("XPathExpressionException caught...");
            e.printStackTrace();
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }

    /**
     * Parses the input into a DOM document and walks it from the document element,
     * loading every node into a schema.
     *
     * @param schema  schema to reload; its data is flushed first.  When null, a new
     *                XmlSchema is created from the input and itemKey.
     * @param input   XML source to parse (non-validating)
     * @param itemKey dotted path of the row element; only used when a schema is created
     * @return the schema that was loaded
     * @throws NullPointerException if parsing failed: parseXML reports the failure by
     *         returning null, and the document is dereferenced here without a check
     */
    public static Schema ingestXML(XmlSchema schema, org.xml.sax.InputSource input, String itemKey) {
        final org.w3c.dom.Document document = parseXML(input, false);

        if (DEBUG.DR) {
            // Diagnostics only: a failure in here is logged and does not stop the ingest.
            try {
                errout("XML parsed, document built:");
                errout("org.w3c.dom.Document: " + Util.tags(document));
                final org.w3c.dom.DocumentType docType = document.getDoctype();
                errout("docType: " + Util.tags(docType));
                if (docType != null) {
                    errout("docType.name: " + Util.tags(docType.getName()));
                    errout("docType.entities: " + Util.tags(docType.getEntities()));
                    errout("docType.notations: " + Util.tags(docType.getNotations()));
                    errout("docType.publicId: " + Util.tags(docType.getPublicId()));
                    errout("docType.systemId: " + Util.tags(docType.getSystemId()));
                }
                errout("impl: " + Util.tags(document.getImplementation().getClass()));
                // only the class is reported: toString() on the element can dump the whole document
                errout("docElement: " + Util.tags(document.getDocumentElement().getClass()));
            } catch (Throwable t) {
                Log.error("debug failure", t);
            }
        }

        final XmlSchema target;
        if (schema != null) {
            schema.flushData();
            target = schema;
        } else {
            target = new XmlSchema(tufts.vue.Resource.instance(input), itemKey);
        }

        // Full recursive walk from the root (XPathExtract is the unused experimental alternative).
        scanNode(target, document.getDocumentElement(), null, null);

        if (DEBUG.DR || DEBUG.SCHEMA) {
            target.dumpSchema(System.err);
        }
        return target;
    }

    /** @return true when the DOM node-type code is character data: a text node or a CDATA section */
    private static boolean isText(int type) {
        switch (type) {
        case Node.TEXT_NODE:
        case Node.CDATA_SECTION_NODE:
            return true;
        default:
            return false;
        }
    }

    /** @return true when the node itself is a text node or a CDATA section */
    private static boolean isText(Node node) {
        final int nodeType = node.getNodeType();
        return isText(nodeType);
    }

    /** @return the short debug label for the node's DOM type (see the int overload) */
    private static final String getNodeType(Node n) {
        final int typeCode = n.getNodeType();
        return getNodeType(typeCode);
    }

    /**
     * Maps a DOM node-type code to a short label for debug output.
     *
     * @param t DOM node-type code (one of the org.w3c.dom.Node *_NODE constants)
     * @return "attr", "cdata", "comment", "document", "element", "entity" or "text";
     *         any other code is returned as its decimal string
     */
    private static final String getNodeType(int t) {
        switch (t) {
        case Node.ATTRIBUTE_NODE:
            return "attr";
        case Node.CDATA_SECTION_NODE:
            return "cdata";
        case Node.COMMENT_NODE:
            return "comment";
        case Node.DOCUMENT_NODE:
            return "document";
        case Node.ELEMENT_NODE:
            return "element";
        case Node.ENTITY_NODE:
            return "entity";
        case Node.TEXT_NODE:
            return "text";
        default:
            return Integer.toString(t);
        }
    }

    // parentPath is the fully-qualified parent name

    /**
     * Convenience entry point for a real DOM node: reads the node's type, name and
     * value and hands them to the full scanNode overload.
     *
     * @param schema     schema receiving the scanned data
     * @param n          node to scan; must not be null
     * @param parentPath fully-qualified dotted name of the parent, or null at the root
     * @param parentName simple name of the parent, or null at the root
     */
    private static void scanNode(XmlSchema schema, org.w3c.dom.Node n, String parentPath, String parentName) {
        scanNode(schema, n, n.getNodeType(), parentPath, parentName, n.getNodeName(), n.getNodeValue());
    }

    /**
     * Core recursive walk.  For one node (or one synthesized name/value pair) this:
     * builds the node's fully-qualified name, notifies the schema that an element opened,
     * records the trimmed value (if any) as a field value, optionally echoes the node as
     * XML through the out/iout helpers, recurses into the children, notifies the schema
     * that the element closed, and finally scans the attributes (or scans them up front
     * when ATTRIBUTES_IMMEDIATE is set).
     *
     * @param schema     schema receiving open/close notifications and field values
     * @param node       DOM node being scanned; null when called from scanFoldedChildren
     *                   with a synthesized pair, in which case no children or attributes
     *                   are visited
     * @param type       DOM node-type code to treat this node as
     * @param parentPath fully-qualified dotted name of the parent; null only at the root
     * @param parentName simple name of the parent (used to label attributes in the XML echo)
     * @param nodeName   simple name of this node
     * @param value      raw node value; may be null
     */
    private static void scanNode(final XmlSchema schema, final org.w3c.dom.Node node, final int type,
            final String parentPath, final String parentName, final String nodeName, final String value) {
        final boolean isAttribute = (type == Node.ATTRIBUTE_NODE);
        // text/CDATA nodes are folded into their parent: they report under the parent's path
        final boolean isMergedText = FOLD_TEXT && isText(type);
        final boolean hasAttributes = (!isAttribute && node != null && node.hasAttributes());
        Node firstChild = null, lastChild = null;

        if (node != null) {
            firstChild = node.getFirstChild();
            lastChild = node.getLastChild();
        }

        // name used in the XML echo: attributes are written as parentName@attrName
        final String XMLName;

        if (isAttribute)
            XMLName = parentName + ATTR_SEPARATOR + nodeName;
        else
            XMLName = nodeName;

        // name reported to the schema: dotted path from the root, '@' before attribute names
        final String fullName;

        if (parentPath != null) { // should only be null first time in at the top root
            if (isMergedText)
                fullName = parentPath;
            else if (isAttribute)
                fullName = parentPath + ATTR_SEPARATOR + nodeName;
            else
                fullName = parentPath + '.' + nodeName;
        } else {
            fullName = nodeName;
        }

        if (type == Node.ELEMENT_NODE)
            schema.trackNodeOpen(fullName);

        // Debug reporting for the shallow levels of the tree only.
        // NOTE(review): the String.format calls below run even when XML_OUTPUT is false,
        // because eoutln tests that flag internally -- the formatted text is then discarded.
        if (depth < REPORT_THRESH) {
            if (depth < REPORT_THRESH - 1) {
                if (type == Node.TEXT_NODE)
                    eoutln(String.format("node(%s) {%s} (len=%d)", getNodeType(type), fullName, value.length()));
                else
                    eoutln(String.format("NODE(%s) {%s} %.192s", getNodeType(type), fullName, node,
                            Util.tags(firstChild)));
            }
            //eoutln("NODE: " + type + " name=" + name + " " + Util.tags(n) + " firstChild=" + Util.tags(firstChild));
            //System.err.println(name);
            else if (XML_DEBUG)
                System.err.print(".");
        }

        if (hasAttributes && ATTRIBUTES_IMMEDIATE)
            scanAttributes(schema, fullName, nodeName, node.getAttributes());

        // trimmed value, or null when there is no value or it is all whitespace
        String outputValue = null;

        if (value != null) {
            outputValue = value.trim();
            if (outputValue.length() > 0) {
                schema.trackFieldValuePair(fullName, outputValue);
            } else
                outputValue = null;
        }

        final NodeList children = node == null ? null : node.getChildNodes();
        // whether the XML echo writes an open/close tag for this node
        final boolean DO_TAG;

        if (isMergedText) {
            // folded text is written inside the parent's tag, never as a tag of its own
            DO_TAG = false;
        } else if (outputValue == null && node != null) {
            // valueless real node: skip the tag when it is empty or holds only blank text
            if (!node.hasChildNodes()) {
                DO_TAG = false;
            } else if (children.getLength() == 1 && isText(firstChild)
                    && firstChild.getNodeValue().trim().length() == 0) {
                DO_TAG = false;
            } else
                DO_TAG = true;

            // if (!DO_TAG) ioutln("<!-- empty: " + nodeName + " -->");
        } else
            DO_TAG = true;

        boolean closeOnSameLine = false;

        if (DO_TAG) {

            iout("<");
            out(XMLName);
            //if (node.hasChildNodes()) out(" children=" + node.getChildNodes().getLength() + " first=" + node.getFirstChild());
            out(">");

            // no children, or a single text child: keep open tag, text and close tag on one line
            if (firstChild == null || (isText(firstChild) && firstChild == lastChild)) {
                //                 if (firstChild != null && firstChild.getNodeType() == Node.CDATA_SECTION_NODE)
                //                     ;
                //                 else
                closeOnSameLine = true;
            } else if (XML_OUTPUT)
                System.out.print('\n');

            // with text folded away, only elements and attributes are expected to reach here
            if (FOLD_TEXT && (type != Node.ELEMENT_NODE && type != Node.ATTRIBUTE_NODE)) {
                final String err = "UNHANDLED TYPE=" + type + "; " + nodeName;
                outln("<" + err + ">");
                errout(err);
            }
        }

        if (outputValue != null) {
            if (type == Node.CDATA_SECTION_NODE) {
                out("<![CDATA[");
                out(outputValue);
                out("]]>");
            } else {
                out(XMLEntityEncode(outputValue));
            }
        }

        if (!isAttribute && node != null) {

            // god knows why, but attributes have themselves as children?  (or is that
            // the #text entry?)  Anyway, if we allow this for an attribute dump, the
            // value of the attribute will literally appear twice in the output,
            // back-to-back as one string.

            // depth is shared static state: raised for the recursion, restored afterwards
            depth++;

            if (FOLD_KEYS || schema.isXMLKeyFold()) {

                scanFoldedChildren(schema, children, fullName, nodeName);

            } else {

                for (int i = 0; i < children.getLength(); i++)
                    scanNode(schema, children.item(i), fullName, nodeName);
            }

            depth--;

        }

        if (DO_TAG) {

            if (closeOnSameLine)
                outln("</" + XMLName + ">");
            else
                ioutln("</" + XMLName + ">");
        }

        if (type == Node.ELEMENT_NODE)
            schema.trackNodeClose(fullName);

        // Deferred attribute scan: this runs after trackNodeClose, so for the row-start
        // element itself the schema no longer has an open row when its attributes arrive.
        if (hasAttributes && !ATTRIBUTES_IMMEDIATE)
            scanAttributes(schema, fullName, nodeName, node.getAttributes());

        //iout("children: " + Util.tags(n.getChildNodes()));
    }

    /**
     * Scans every attribute in the map as a child of the named element.
     *
     * @param schema   schema receiving the attribute values
     * @param fullName fully-qualified name of the owning element
     * @param nodeName simple name of the owning element
     * @param attr     attributes to scan; null or empty means nothing to do
     */
    private static void scanAttributes(XmlSchema schema, String fullName, String nodeName, NamedNodeMap attr) {

        if (attr == null)
            return;

        int index = 0;
        while (index < attr.getLength()) {
            final Node attribute = attr.item(index);
            scanNode(schema, attribute, fullName, nodeName);
            index++;
        }
    }

    /**
     * Scans children while folding Apple plist style pairs
     * (<key>UserKey</key><string>UserValue</string>) into a single synthesized node
     * named after the key text and carrying the following element's text as its value
     * (iTunes Music Library.xml was the original test case).
     *
     * A "key" element followed by "dict" or "array" is not folded: the key is dropped
     * and the container is scanned normally on the next iteration.  Any child that is
     * not part of a key/value pair is scanned as-is.
     *
     * Empty elements are tolerated: a key or value element without any child yields a
     * null name or value (the same thing the code already passes on when the first
     * child carries no node value) instead of failing with a NullPointerException.
     *
     * @param schema   schema receiving the scanned data
     * @param children child nodes of the element being scanned
     * @param fullName fully-qualified name of the parent element
     * @param nodeName simple name of the parent element
     */
    private static void scanFoldedChildren(XmlSchema schema, final NodeList children, final String fullName,
            final String nodeName) {

        for (int i = 0; i < children.getLength(); i++) {
            final Node item = children.item(i);
            final Node next = children.item(i + 1); // null when item is the last child

            if (next != null && "key".equals(item.getNodeName())) {
                final String nextName = next.getNodeName();

                if ("dict".equals(nextName) || "array".equals(nextName)) {
                    // containers are not pulled up yet: skip the key, scan the container next time round
                    continue;
                }

                // the key text sits one level down, in the key element's first child
                final Node keyText = item.getFirstChild();
                String newNodeName = (keyText == null) ? null : keyText.getNodeValue();

                if (newNodeName != null)
                    newNodeName = newNodeName.replace(' ', '_');

                final String newNodeValue;

                if ("true".equals(nextName)) {
                    newNodeValue = "true"; // a simple self-terminating "<true/>" with no children
                } else if ("false".equals(nextName)) {
                    newNodeValue = "false"; // rarely seen in iTunes Music Library.xml
                } else {
                    // e.g. an empty <string></string> has no text child at all
                    final Node valueText = next.getFirstChild();
                    newNodeValue = (valueText == null) ? null : valueText.getNodeValue();
                }

                // the key text becomes the node name, the following element's text the node value
                scanNode(schema, null, Node.ELEMENT_NODE, fullName, nodeName, newNodeName, newNodeValue);
                i++; // the value element has been consumed together with its key
                continue;
            }

            scanNode(schema, item, fullName, nodeName);
        }
    }

    /*
    public static void dumpElement(Element e) {
    out("\tElement: " + Util.tags(e));
    out("\tElement tag: " + e.getTagName());
    out("\tElement SchemaTypeInfo: " + Util.tags(e.getSchemaTypeInfo()));
    }
    */

    // Parses an XML file and returns a DOM document.
    // If validating is true, the contents is validated against the DTD
    // specified in the file.
    /**
     * Parses XML into a DOM document.
     *
     * @param input      a String (treated as a file path), an InputSource or an InputStream
     * @param validating when true, the content is validated against the DTD named in the file
     * @return the parsed document, or null on any failure -- including an unsupported
     *         input type -- after the stack trace has been printed to System.err
     */
    private static org.w3c.dom.Document parseXML(Object input, boolean validating) {
        // NOTE(review): the factory is otherwise left at its defaults, so DOCTYPE and
        // external-entity handling is whatever the JAXP implementation provides -- worth
        // reviewing if untrusted feeds are parsed here.
        // NOTE(review): ignoring element-content whitespace is documented by JAXP as
        // relying on the content model; confirm its effect when validating is false.
        try {
            final DocumentBuilderFactory builders = DocumentBuilderFactory.newInstance();
            builders.setIgnoringElementContentWhitespace(true);
            builders.setIgnoringComments(true);
            builders.setValidating(validating);

            if (input instanceof String)
                return builders.newDocumentBuilder().parse(new File((String) input));

            if (input instanceof InputSource)
                return builders.newDocumentBuilder().parse((InputSource) input);

            if (input instanceof InputStream)
                return builders.newDocumentBuilder().parse((InputStream) input);

            // caught by the handler below like any other failure
            throw new Error("Unhandled input type: " + Util.tags(input));

        } catch (Throwable t) {
            t.printStackTrace();
            return null;
        }
    }

    /**
     * Escapes the characters &amp;, &lt;, &gt; and the double quote as XML entities.
     * The apostrophe is deliberately left alone: "&amp;apos;" is a legal XML entity but
     * not a legal HTML one, and this text may end up in an HTML component.
     *
     * No buffer is allocated unless something actually needs escaping, in which case
     * the input is returned unchanged (the very same instance).
     *
     * todo: if the result is simply destined for a writer it would be more efficient to
     * pass the writer in; Apache Commons StringEscapeUtils has methods for this.
     *
     * @param text text to encode; may be null
     * @return the encoded text, the input itself when nothing needed encoding, or null for null
     */
    public static String XMLEntityEncode(final String text) {
        if (text == null)
            return null;

        final int length = text.length();
        StringBuilder encoded = null; // created at the first character that needs escaping

        for (int pos = 0; pos < length; pos++) {
            final char ch = text.charAt(pos);

            final String replacement;
            if (ch == '&')
                replacement = "&amp;";
            else if (ch == '<')
                replacement = "&lt;";
            else if (ch == '>')
                replacement = "&gt;";
            else if (ch == '"')
                replacement = "&quot;";
            else
                replacement = null;

            if (replacement == null) {
                // ordinary character: only copied once a buffer exists
                if (encoded != null)
                    encoded.append(ch);
            } else {
                if (encoded == null) {
                    // first escape: start the buffer with everything seen so far
                    encoded = new StringBuilder(length + 12);
                    encoded.append(text, 0, pos);
                }
                encoded.append(replacement);
            }
        }

        return encoded != null ? encoded.toString() : text;
    }

    /** Writes s (no newline) to the XML echo, indented by the current static scan depth. */
    public static void iout(String s) {
        iout(depth, s);
    }

    /** Writes s followed by a newline to the XML echo, indented by the current static scan depth. */
    public static void ioutln(String s) {
        ioutln(depth, s);
    }

    final static String TAB = "    ";

    /**
     * Prints _depth copies of TAB and then s (no newline) on System.out.
     * Does nothing unless XML_OUTPUT is enabled.
     */
    public static void iout(int _depth, String s) {
        if (!XML_OUTPUT)
            return;
        int level = 0;
        while (level < _depth) {
            System.out.print(TAB);
            level++;
        }
        System.out.print(s);
    }

    /**
     * Prints _depth copies of TAB and then s plus a newline on System.out.
     * Does nothing unless XML_OUTPUT is enabled.
     */
    public static void ioutln(int _depth, String s) {
        if (!XML_OUTPUT)
            return;
        int level = 0;
        while (level < _depth) {
            System.out.print(TAB);
            level++;
        }
        System.out.println(s);
    }

    /**
     * Prints _depth copies of TAB and then s plus a newline on System.err.
     * Like the System.out helpers, this is gated on XML_OUTPUT.
     */
    public static void eoutln(int _depth, String s) {
        if (!XML_OUTPUT)
            return;
        int level = 0;
        while (level < _depth) {
            System.err.print(TAB);
            level++;
        }
        System.err.println(s);
    }

    /** Writes s followed by a newline to the error-stream echo, indented by the current static scan depth. */
    public static void eoutln(String s) {
        eoutln(depth, s);
    }

    /** Prints s (or the text "null") on System.out without a newline; no-op unless XML_OUTPUT is enabled. */
    public static void out(String s) {
        if (!XML_OUTPUT)
            return;
        final String text = (s != null) ? s : "null";
        System.out.print(text);
    }

    /** Prints s (or the text "null") on System.out followed by a newline; no-op unless XML_OUTPUT is enabled. */
    public static void outln(String s) {
        if (!XML_OUTPUT)
            return;
        final String text = (s != null) ? s : "null";
        System.out.println(text);
    }

    /** Sends s (or the text "null") to the class logger at debug level. */
    public static void errout(String s) {
        final String message = (s != null) ? s : "null";
        Log.debug(message);
    }

    final static boolean ATTRIBUTES_IMMEDIATE = false; // false better for clearer XML output, true better for schema output (e.g., rss.version 1st, not last)
    final static boolean FOLD_TEXT = true; // default true: fold Node.TEXT_NODE(#text) and CDATA items into parent node

    final static boolean FOLD_KEYS = false; // auto-enabled if top-level item is "plist" (current breaks on JIRA XML if true)

    //final static int REPORT_THRESH = FOLD_KEYS ? 4 : 3;
    final static int REPORT_THRESH = 4;
    //final static int REPORT_THRESH = 1;

    final static char ATTR_SEPARATOR = '@';

    private static final String JIRA_VUE_URL = "http://bugs.atech.tufts.edu/secure/IssueNavigator.jspa?view=rss&pid=10001&tempMax=9999&reset=true&decorator=none";
    private static final String JIRA_SFRAIZE_COOKIE = "seraph.os.cookie=LkPlQkOlJlHkHiEpGiOiGjJjFi";

    /**
     * Test helper: opens the JIRA RSS feed at JIRA_VUE_URL, sending JIRA_SFRAIZE_COOKIE
     * as the Cookie header, logs the response headers and returns the response stream.
     * The caller owns the returned stream.
     *
     * Earlier note (SMF 2008-10-02), kept for context: some feeds (e.g. Craigslist) are
     * ISO-8859-1 and declare that only in the Content-Type header, and the parser then
     * failed with "Unconvertible UTF-8 character" when given the raw stream.  A ROME
     * XmlReader(URLConnection) was mentioned as a possible way to sort out the encoding;
     * see http://diveintomark.org/archives/2004/02/13/xml-media-types for background.
     *
     * @return the open response stream
     * @throws IOException if the connection cannot be opened or read
     */
    private static InputStream getTestXMLStream() throws IOException {

        final URL feed = new URL(JIRA_VUE_URL);
        final URLConnection connection = feed.openConnection();
        connection.setRequestProperty("Cookie", JIRA_SFRAIZE_COOKIE);
        errout("Opening connection to " + feed);
        connection.connect();

        errout("Getting InputStream...");
        final InputStream stream = connection.getInputStream();
        errout("Got " + Util.tags(stream));

        errout("Getting headers...");
        final Map<String, List<String>> headerFields = connection.getHeaderFields();

        errout("HEADERS:");
        for (Map.Entry<String, List<String>> header : headerFields.entrySet()) {
            final String headerName = header.getKey();
            final List<String> headerValues = header.getValue();
            errout(headerName + ": " + headerValues);
        }

        return stream;
    }

    /**
     * Command-line test driver: ingests one XML file with all data/schema debugging
     * switched on.
     *
     * Usage: XMLIngest &lt;xml-file&gt; [item-path]
     *
     * The file is handed to the parser as a byte stream so that the parser can detect
     * the encoding from the XML declaration; a character stream would make it disregard
     * the declared encoding and use whatever charset the reader was opened with.  The
     * item path is optional: without it the schema is built with a null item path, so
     * no element starts a row.
     *
     * @param args args[0] is the XML file to read, args[1] (optional) the dotted path of
     *             the row element
     * @throws IOException if the file cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        DEBUG.Enabled = DEBUG.DR = DEBUG.IO = DEBUG.SCHEMA = true;

        tufts.vue.VUE.parseArgs(args);

        org.apache.log4j.Logger.getRootLogger().removeAllAppenders(); // need to do this or we get everything twice
        org.apache.log4j.Logger.getRootLogger()
                .addAppender(new org.apache.log4j.ConsoleAppender(tufts.vue.VUE.MasterLogPattern, "System.err"));

        errout("Max mem: " + Util.abbrevBytes(Runtime.getRuntime().maxMemory()));

        if (args.length < 1) {
            // previously an ArrayIndexOutOfBoundsException with no hint about what was missing
            System.err.println("Usage: XMLIngest <xml-file> [item-path]");
            System.exit(1);
        }

        final String file = args[0];
        final String key = args.length > 1 ? args[1] : null;

        Log.debug("File: " + file);
        Log.debug("Key: " + key);

        final InputStream bytes = new FileInputStream(file);
        try {
            final InputSource is = new InputSource(file);
            is.setByteStream(bytes); // bytes, not chars: lets the parser honor the declared encoding

            ingestXML(null, is, key);
        } finally {
            bytes.close();
        }

        System.err.println("\n");
        Log.debug("done");
    }

    //     public static void main(String[] args)
    //         throws IOException
    //     {
    //         //final XmlSchema schema = new RssSchema();

    //         errout("Max mem: " + Util.abbrevBytes(Runtime.getRuntime().maxMemory()));
    //         //getXMLStream();System.exit(0);

    //         Document doc;
    //         String src;

    //         if (args.length < 1) {
    //             doc = parseXML(getTestXMLStream(), false);
    //             src = JIRA_VUE_URL;
    //         } else {
    //             doc = parseXML(args[0], false);
    //             src = args[0];
    //         }
    //         //doc.normalizeDocument();
    //         errout("GOT DOC " + Util.tag(doc) + " " + doc);
    //         errout("InputEncoding: " + doc.getInputEncoding());
    //         errout("xmlEncoding: " + doc.getXmlEncoding());
    //         errout("xmlVersion: " + doc.getXmlVersion());
    //         errout("docType: " + Util.tags(doc.getDoctype()));
    //         errout("impl: " + Util.tags(doc.getImplementation()));
    //         errout("docElement: " + Util.tags(doc.getDocumentElement()));
    //         //out("element: " + Util.tags(doc.getDocumentElement()));

    //         outln("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
    //         outln("<!-- created by RSSTest " + new Date() + " from " + src + " -->");

    //         final XmlSchema schema = new XmlSchema(Util.tag(doc), "rss.channel.item");

    //         if (true)
    //             XPathExtract(schema, doc);
    //         else
    //             scanNode(schema, doc.getDocumentElement(), null, null);

    //         schema.dumpSchema(System.err);
    //     }

}