XMLParser.java Source code

Java tutorial

Introduction

Here is the source code for XMLParser.java

Source

//Copyright 2007-2008 David Yu dyuproject@gmail.com
//------------------------------------------------------------------------
//Licensed under the Apache License, Version 2.0 (the "License");
//you may not use this file except in compliance with the License.
//You may obtain a copy of the License at 
//http://www.apache.org/licenses/LICENSE-2.0
//Unless required by applicable law or agreed to in writing, software
//distributed under the License is distributed on an "AS IS" BASIS,
//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//See the License for the specific language governing permissions and
//limitations under the License.

//package com.dyuproject.util.xml;

import java.io.IOException;
import java.io.InputStreamReader;

/**
 * A simple, lenient XML parser that starts parsing right away and validates
 * along the way.
 *
 * <p>The input is consumed chunk by chunk through a single reusable character
 * buffer and every recognised construct is reported to a {@link LazyHandler}
 * as soon as it is complete.  The handler can stop the parse at any element
 * boundary by returning {@code false}.
 *
 * <p>Text is reported exactly as it appears in the input (no trimming, no
 * entity decoding) and a single run of text may be delivered through several
 * {@link LazyHandler#characters(char[], int, int)} calls.
 *
 * @author David Yu
 * @created Sep 17, 2008
 */

public final class XMLParser {

    /** Nothing has been seen yet, or the prolog/doctype has just been closed. */
    private static final int STATE_INITIAL = 0;

    private static final int STATE_EL_STARTING = 1;
    private static final int STATE_EL_STARTED = 2;
    private static final int STATE_EL_ENDING = 3;
    private static final int STATE_EL_ENDED = 4;

    private static final int STATE_EL_ATTR_NAME_START = 5;
    private static final int STATE_EL_ATTR_VALUE_START = 6;
    private static final int STATE_EL_ATTR_VALUE_END = 7;

    private static final int STATE_EL_TEXT = 8;
    private static final int STATE_COMMENT_STARTING = 9;
    private static final int STATE_COMMENT_DASH_START = 10;
    private static final int STATE_COMMENT_STARTED = 11;
    private static final int STATE_COMMENT_DASH_END = 12;
    private static final int STATE_COMMENT_ENDING = 13;
    private static final int STATE_IGNORE = 14;
    private static final int STATE_CDATA_STARTING = 15;
    private static final int STATE_CDATA_STARTED = 16;
    private static final int STATE_CDATA_ENDING = 17;
    private static final int STATE_CDATA_ENDED = 18;

    /** Value of a buffer position variable when no position is recorded. */
    private static final int NO_MARK = -1;

    private static int __defaultBufferSize = 4096;

    /**
     * Sets the buffer size used by
     * {@link #parse(InputStreamReader, LazyHandler, boolean)}.
     *
     * @param size the new default buffer size, in chars
     */
    public static void setDefaultBufferSize(int size) {
        __defaultBufferSize = size;
    }

    /**
     * Lazily parses the given {@code reader} using the default buffer size 
     * {@link #__defaultBufferSize}.  The parsing can be terminated by 
     * the {@link LazyHandler} {@code handler} at any point.
     *
     * @param reader the source of the xml document
     * @param handler receives the parse events; must not be {@code null}
     * @param includeInnerText whether text and CDATA content is reported
     * @throws IOException if reading fails or the document is rejected
     * @throws IllegalArgumentException if {@code handler} is {@code null} or
     *         the default buffer size is not positive
     */
    public static void parse(InputStreamReader reader, LazyHandler handler, boolean includeInnerText)
            throws IOException {
        parse(reader, handler, includeInnerText, __defaultBufferSize);
    }

    /**
     * Lazily parses the given {@code reader}.  The parsing can be terminated by 
     * the {@link LazyHandler} {@code handler} at any point.
     *
     * <p>{@code bufferSize} is only the initial size: the buffer grows when a
     * single token (tag name, attribute name or attribute value) does not
     * fit, so the result does not depend on the size chosen.
     *
     * @param reader the source of the xml document
     * @param handler receives the parse events; must not be {@code null}
     * @param includeInnerText whether text and CDATA content is reported
     * @param bufferSize initial size of the read buffer, in chars; must be
     *        positive
     * @throws IOException if reading fails or the document is rejected
     *         ("invalid xml.")
     * @throws IllegalArgumentException if {@code handler} is {@code null} or
     *         {@code bufferSize} is not positive
     */
    public static void parse(InputStreamReader reader, LazyHandler handler, boolean includeInnerText,
            int bufferSize) throws IOException {
        if (handler == null)
            throw new IllegalArgumentException("LazyHandler arg must not be null.");
        if (bufferSize < 1)
            throw new IllegalArgumentException("bufferSize must be positive: " + bufferSize);
        char[] cbuf = new char[bufferSize];
        // index in cbuf of the next char to examine / the next free slot
        int offset = 0;
        int len;
        int state = STATE_INITIAL;
        // state to return to once a comment is closed
        int stateBeforeComment = STATE_INITIAL;
        // index of the char just before the token currently being collected
        int mark = NO_MARK;
        // set once the element name has been reported; only ever compared to NO_MARK
        int elwsMark = NO_MARK;
        // index of the ':' separating namespace prefix and local name
        int nsMark = NO_MARK;
        String attrName = null;
        // true when the current attribute value is delimited by double quotes
        boolean dq = true;
        boolean searchRoot = true;
        while ((len = reader.read(cbuf, offset, cbuf.length - offset)) != -1) {
            for (int i = 0; i < len; i++, offset++) {
                char c = cbuf[offset];
                switch (c) {
                case '<':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--<" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_INITIAL:
                        state = STATE_EL_STARTING;
                        mark = offset;
                        continue;
                    case STATE_EL_ENDED:
                    case STATE_EL_STARTED:
                        stateBeforeComment = state;
                        state = STATE_EL_STARTING;
                        mark = offset;
                        continue;
                    case STATE_EL_TEXT:
                        stateBeforeComment = state;
                        state = STATE_EL_STARTING;
                        if (mark != NO_MARK && includeInnerText) {
                            handler.characters(cbuf, mark + 1, offset - mark - 1);
                        }
                        mark = offset;
                        continue;
                    }
                    continue;

                case '>':
                    switch (state) {
                    case STATE_IGNORE:
                        // end of the prolog / doctype
                        if (stateBeforeComment == STATE_INITIAL)
                            state = STATE_INITIAL;
                        continue;
                    case STATE_EL_TEXT:// a literal '>' inside text
                    case STATE_COMMENT_STARTED:
                        continue;
                    case STATE_CDATA_ENDING:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_ENDING:
                        state = STATE_EL_ENDED;
                        if (!handler.endElement())
                            return;
                        elwsMark = NO_MARK;
                        continue;
                    case STATE_EL_ATTR_NAME_START:
                    case STATE_EL_STARTING:
                        // the name has already been reported if whitespace was seen
                        if (elwsMark == NO_MARK) {
                            if (!fireStart(handler, searchRoot, cbuf, mark, nsMark, offset))
                                return;
                            searchRoot = false;
                        }
                        nsMark = NO_MARK;
                        elwsMark = NO_MARK;
                        state = STATE_EL_STARTED;
                        mark = NO_MARK;
                        continue;
                    case STATE_COMMENT_ENDING:
                        state = stateBeforeComment;
                        continue;
                    case STATE_CDATA_ENDED:
                        state = STATE_EL_TEXT;
                        if (mark != NO_MARK && includeInnerText) {
                            // exclude the two ']' that precede this '>'
                            handler.characters(cbuf, mark + 1, offset - 2 - mark - 1);
                        }
                        mark = offset;
                        continue;
                    }
                    continue;

                case '/':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--/" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_ATTR_NAME_START:
                        mark = NO_MARK;
                        state = STATE_EL_ENDING;
                        continue;

                    case STATE_EL_STARTED:
                        // text that begins with '/'
                        state = STATE_EL_TEXT;
                        mark = offset - 1;
                        continue;
                    case STATE_EL_STARTING:
                        // "<name/" reports the element; "</" (mark + 1 == offset) and
                        // "<name attr='v'/" (name already reported) must not.
                        if (elwsMark == NO_MARK && mark + 1 != offset) {
                            if (!fireStart(handler, searchRoot, cbuf, mark, nsMark, offset))
                                return;
                            searchRoot = false;
                        }
                        state = STATE_EL_ENDING;
                        elwsMark = NO_MARK;
                        nsMark = NO_MARK;
                        mark = NO_MARK;
                        continue;

                    }
                    continue;

                case ':':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--:" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_STARTING:
                        // only one prefix separator is accepted per name
                        if (nsMark != NO_MARK)
                            throw new IOException("invalid xml.");
                        nsMark = offset;
                        continue;
                    }
                    continue;
                case '?':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--?" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_STARTING:
                        // "<?" right after text is left alone
                        if (stateBeforeComment == STATE_EL_TEXT)
                            continue;
                        state = STATE_COMMENT_STARTING;
                        mark = NO_MARK;
                        continue;
                    }
                    continue;
                case '!':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--!" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_STARTING:
                        state = STATE_COMMENT_STARTING;
                        mark = NO_MARK;
                        continue;
                    }
                    continue;
                case '[':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--[" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_COMMENT_STARTING:
                        // "<![" : first bracket of "<![CDATA["
                        state = STATE_CDATA_STARTING;
                        if (mark != NO_MARK && includeInnerText) {
                            handler.characters(cbuf, mark + 1, offset - 2 - mark - 1);
                        }
                        mark = NO_MARK;
                        continue;
                    case STATE_CDATA_STARTING:
                        // second bracket: the content starts right after it
                        state = STATE_CDATA_STARTED;
                        mark = offset;
                        continue;
                    }
                    continue;
                case ']':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--]" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDED:
                        // "]]]": the last two chars are still "]]", so the section
                        // can still be closed by a following '>'
                        continue;
                    case STATE_CDATA_STARTED:
                        state = STATE_CDATA_ENDING;
                        continue;
                    case STATE_CDATA_ENDING:
                        state = STATE_CDATA_ENDED;
                        continue;
                    }
                    continue;

                case '-':
                    switch (state) {
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_STARTING:
                        state = STATE_COMMENT_DASH_START;
                        continue;
                    case STATE_COMMENT_DASH_START:
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_COMMENT_STARTED:
                        state = STATE_COMMENT_DASH_END;
                        continue;
                    case STATE_COMMENT_DASH_END:
                        state = STATE_COMMENT_ENDING;
                        continue;
                    case STATE_COMMENT_ENDING:// "---" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    }
                    continue;

                case '=':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--=" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_ATTR_NAME_START:
                        state = STATE_EL_ATTR_VALUE_START;
                        attrName = new String(cbuf, mark + 1, offset - mark - 1).trim();
                        mark = NO_MARK;
                        continue;
                    }
                    continue;

                case '\'':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--'" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_ATTR_VALUE_START:
                        dq = false;
                        state = STATE_EL_ATTR_VALUE_END;
                        mark = offset;
                        continue;

                    case STATE_EL_ATTR_VALUE_END:
                        // a single quote inside a double-quoted value is plain content
                        if (dq)
                            continue;
                        state = STATE_EL_STARTING;
                        handler.attribute(attrName, new String(cbuf, mark + 1, offset - mark - 1).trim());
                        attrName = null;
                        mark = NO_MARK;
                        continue;

                    }
                    continue;
                case '"':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_ENDING:// "--"" does not close a comment
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;
                    case STATE_EL_ATTR_VALUE_START:
                        dq = true;
                        state = STATE_EL_ATTR_VALUE_END;
                        mark = offset;
                        continue;

                    case STATE_EL_ATTR_VALUE_END:
                        // a double quote inside a single-quoted value is plain content
                        if (!dq)
                            continue;
                        state = STATE_EL_STARTING;
                        handler.attribute(attrName, new String(cbuf, mark + 1, offset - mark - 1).trim());
                        attrName = null;
                        mark = NO_MARK;
                        continue;

                    }
                    continue;

                case ' ':
                case '\t':
                case '\r':
                case '\n':
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_EL_STARTING:
                        // whitespace inside a tag: the next token is an attribute name
                        state = STATE_EL_ATTR_NAME_START;
                        if (elwsMark == NO_MARK) {
                            if (!fireStart(handler, searchRoot, cbuf, mark, nsMark, offset))
                                return;
                            searchRoot = false;
                        }
                        nsMark = NO_MARK;
                        elwsMark = offset;
                        mark = offset;
                        continue;
                    }
                    continue;

                default:
                    switch (state) {
                    case STATE_COMMENT_STARTED:
                    case STATE_IGNORE:
                        continue;
                    case STATE_COMMENT_DASH_END:
                    case STATE_COMMENT_ENDING:
                        state = STATE_COMMENT_STARTED;
                        continue;
                    case STATE_CDATA_ENDING:
                    case STATE_CDATA_ENDED:
                        state = STATE_CDATA_STARTED;
                        continue;

                    case STATE_EL_STARTED:
                        // first non-whitespace char after a start tag
                        state = STATE_EL_TEXT;
                        if (includeInnerText)
                            mark = offset - 1;
                        continue;

                    case STATE_COMMENT_STARTING:
                        // "<?x" / "<!x" is only accepted before the first element
                        if (stateBeforeComment != STATE_INITIAL)
                            throw new IOException("invalid xml.");

                        mark = NO_MARK;
                        state = STATE_IGNORE;
                        continue;
                    case STATE_COMMENT_DASH_START:
                        throw new IOException("invalid xml.");
                    }
                    continue;
                }
            }

            // The chunk is exhausted: decide what has to survive in the buffer.
            // Index 0 is kept out of use from here on so that "mark = offset - 1"
            // can never produce NO_MARK for a token starting at the buffer front.
            if (mark == NO_MARK) {
                offset = 1;
            } else if (state == STATE_EL_TEXT || state == STATE_CDATA_STARTED) {
                if (includeInnerText) {
                    // hand over what has been collected and keep collecting at index 1
                    int pending = offset - mark - 1;
                    if (pending > 0)
                        handler.characters(cbuf, mark + 1, pending);
                    mark = 0;
                } else {
                    mark = NO_MARK;
                }
                offset = 1;
            } else {
                // an unfinished token: move it (with its leading marker) to the front
                int copyLen = offset - mark;
                System.arraycopy(cbuf, mark, cbuf, 0, copyLen);
                if (nsMark != NO_MARK)
                    nsMark = nsMark > mark ? nsMark - mark : NO_MARK;
                offset = copyLen;
                mark = 0;
            }

            // A token larger than the buffer would otherwise leave no room to read into.
            if (offset == cbuf.length) {
                char[] grown = new char[cbuf.length * 2];
                System.arraycopy(cbuf, 0, grown, 0, offset);
                cbuf = grown;
            }
        }
    }

    /**
     * Reports the element whose tag name occupies {@code cbuf[mark + 1 .. end)}.
     * When {@code nsMark} is set, the chars before it form the namespace
     * prefix and the chars after it the local name.
     *
     * @param root {@code true} to report through
     *        {@link LazyHandler#rootElement(String, String)}, {@code false}
     *        for {@link LazyHandler#startElement(String, String)}
     * @return the handler's answer; {@code false} means parsing must stop
     */
    private static boolean fireStart(LazyHandler handler, boolean root, char[] cbuf, int mark,
            int nsMark, int end) {
        String name;
        String namespace = null;
        if (nsMark == NO_MARK) {
            name = new String(cbuf, mark + 1, end - mark - 1).trim();
        } else {
            namespace = new String(cbuf, mark + 1, nsMark - mark - 1).trim();
            name = new String(cbuf, nsMark + 1, end - nsMark - 1).trim();
        }
        return root ? handler.rootElement(name, namespace) : handler.startElement(name, namespace);
    }

}

/**
 * Receives the events produced by {@link XMLParser} while it walks a document.
 * The {@code boolean} callbacks let the handler stop the parse: returning
 * {@code false} makes the parser return immediately.
 */
interface LazyHandler {

    /**
     * Callback that gets called only once upon traversing the root xml element.
     *
     * @param name the element name without its prefix
     * @param namespace the prefix before the ':' or {@code null} if there is none
     * @return {@code false} to stop parsing
     */
    boolean rootElement(String name, String namespace);

    /**
     * Callback after traversing the start of xml elements (E.g &lt;foo&gt;).
     *
     * @param name the element name without its prefix
     * @param namespace the prefix before the ':' or {@code null} if there is none
     * @return {@code false} to stop parsing
     */
    boolean startElement(String name, String namespace);

    /**
     * Callback after traversing the end of xml elements (E.g &lt;/foo&gt; or 
     * /&gt;).
     *
     * @return {@code false} to stop parsing
     */
    boolean endElement();

    /**
     * Callback after traversing the attributes of an element.
     *
     * @param name the attribute name as written, including any prefix
     * @param value the trimmed text between the quotes
     */
    void attribute(String name, String value);

    /**
     * Callback after traversing the text content of an element.  May be
     * called several times for one run of text, and with a {@code length} of
     * zero.  {@code data} is the parser's working buffer, which is overwritten
     * as parsing continues, so the chars must be copied during the call.
     *
     * @param data the buffer holding the text
     * @param start index of the first char
     * @param length number of chars
     */
    void characters(char[] data, int start, int length);

}

/*
    
//Copyright 2007-2008 David Yu dyuproject@gmail.com
//------------------------------------------------------------------------
//Licensed under the Apache License, Version 2.0 (the "License");
//you may not use this file except in compliance with the License.
//You may obtain a copy of the License at 
//http://www.apache.org/licenses/LICENSE-2.0
//Unless required by applicable law or agreed to in writing, software
//distributed under the License is distributed on an "AS IS" BASIS,
//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//See the License for the specific language governing permissions and
//limitations under the License.
    
    
//package com.dyuproject.util.xml;
    
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
    
import junit.framework.TestCase;
    
//* @author David Yu
//* @created Sep 18, 2008
    
public class XMLParserTest extends TestCase
{
      
  static final String prefix = "com/dyuproject/util/xml/";
      
  static URL getResource(String resource)
  {
  return Thread.currentThread().getContextClassLoader().getResource(prefix + resource);
  }
      
  public void testNamespace() throws Exception
  {
  String url = "http://open.login.yahooapis.com/openid20/www.yahoo.com/xrds";
  HttpURLConnection con = (HttpURLConnection)new URL(url).openConnection();
  con.setRequestMethod("GET");
  con.setDefaultUseCaches(false);
  con.setInstanceFollowRedirects(false);
  con.setDoInput(true);
  con.connect();
  SimpleHandler handler = new SimpleHandler();
  InputStreamReader reader = new InputStreamReader(con.getInputStream());
  try
  {
      XMLParser.parse(reader, handler, true);
      Node xrds = handler.getNode();
      assertEquals("xrds", xrds.getNamespace());
      Node xrd = xrds.getNode("xrd");
      Node service = xrd.getNode("service");
      assertTrue(0!=service.getNodes("type").size());
      assertEquals("xrds", service.getLastNode().getNamespace());
  }
  finally
  {
      reader.close();
      con.disconnect();
  }   
  }
      
  public void testSimple() throws Exception
  {
  SimpleHandler handler = new SimpleHandler();
  InputStreamReader reader = new InputStreamReader(getResource("simple.xml").openStream());
  try
  {
      XMLParser.parse(reader, handler, true);
      Node root = handler.getNode();
      assertEquals("root", root.getName());
      Node foo = root.getNode("foo");
      assertNotNull(foo);
      assertEquals(foo.getText().toString(), "baz");
      Node bar = foo.getNode("bar");
      assertNotNull(bar);
  }
  finally
  {
      reader.close();
  }  
  }
      
  public void testTrimAndCDATA() throws Exception
  {
  SimpleHandler handler = new SimpleHandler();
  InputStreamReader reader = new InputStreamReader(getResource("xrds").openStream());
  try
  {
      XMLParser.parse(reader, handler, true);
      Node xrds = handler.getNode();
      assertEquals("xrds", xrds.getNamespace());
      Node xrd = xrds.getNode("xrd");
      Node service = xrd.getNode("service");
      assertTrue(0!=service.getNodes("type").size());
      assertEquals("xrds", service.getLastNode().getNamespace());
      Node foo = xrds.getNode("FOO");
      assertNotNull(foo);
      assertEquals(foo.getText().toString(), "I am a cdata text. yep\nyep");
      System.err.println(foo.getText().toString());
  }
  finally
  {
      reader.close();
  }        
  }
      
  public void testSiteXrds() throws Exception
  {
  SimpleHandler handler = new SimpleHandler();
  InputStreamReader reader = new InputStreamReader(getResource("site-xrds").openStream());
  try
  {
      XMLParser.parse(reader, handler, true);
      Node xrds = handler.getNode();
      assertNotNull(xrds);
      assertEquals("xrds", xrds.getNamespace());
      assertEquals("XRDS", xrds.getName());
      Node signature = xrds.getNode("Signature");
      assertNotNull(signature);
      assertEquals("ds", signature.getNamespace());
      Node xrd = xrds.getNode("XRD");
      assertNotNull(xrd);
      Node canonicalID = xrd.getNode("CanonicalID");
      assertNotNull(canonicalID);
      assertEquals("dyuproject.com", canonicalID.getText().toString());
      Node service = xrd.getNode("Service");
      assertNotNull(service);
      Node uri = service.getNode("URI");
      assertNotNull(uri);
      assertEquals("https://www.google.com/a/dyuproject.com/o8/ud?be=o8", uri.getText().toString());
  }
  finally
  {
      reader.close();
  }
  }
    
}
*/