Java tutorial
/* * File : SimpleXMLParserDocumentImpl.java * Created : 5 Oct. 2003 * By : Parg * * Azureus - a Java Bittorrent client * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details ( see the LICENSE file ). * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.gudy.azureus2.pluginsimpl.local.utils.xml.simpleparser; import javax.xml.parsers.*; import org.xml.sax.*; import org.apache.commons.lang.Entities; import org.gudy.azureus2.core3.util.AENetworkClassifier; import org.gudy.azureus2.core3.util.Constants; import org.gudy.azureus2.core3.util.Debug; import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocument; import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentAttribute; import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentException; import org.gudy.azureus2.plugins.utils.xml.simpleparser.SimpleXMLParserDocumentNode; import org.w3c.dom.*; import com.aelitis.azureus.core.util.UncloseableInputStream; import java.io.*; import java.net.InetAddress; import java.net.URL; import java.net.URLConnection; import java.net.UnknownHostException; import java.util.*; public class SimpleXMLParserDocumentImpl implements SimpleXMLParserDocument { private static DocumentBuilderFactory dbf_singleton; private URL source_url; private Document document; private SimpleXMLParserDocumentNodeImpl root_node; public SimpleXMLParserDocumentImpl(File file) throws SimpleXMLParserDocumentException { try { create(new FileInputStream(file)); } catch (Throwable e) { throw (new SimpleXMLParserDocumentException(e)); } } public SimpleXMLParserDocumentImpl(String data) throws SimpleXMLParserDocumentException { try { create(new ByteArrayInputStream(data.getBytes(Constants.DEFAULT_ENCODING))); } catch (UnsupportedEncodingException e) { } } /** * @deprecated * @param _input_stream * @throws SimpleXMLParserDocumentException */ public SimpleXMLParserDocumentImpl(InputStream _input_stream) throws SimpleXMLParserDocumentException { this(null, _input_stream); } public SimpleXMLParserDocumentImpl(URL _source_url, InputStream _input_stream) throws SimpleXMLParserDocumentException { source_url = _source_url; create(_input_stream); } protected static synchronized DocumentBuilderFactory getDBF() { // getting the factory involves a fait bit of work - cache it if (dbf_singleton == null) { dbf_singleton = DocumentBuilderFactory.newInstance(); // Set namespaceAware to true to get a DOM Level 2 tree with nodes // containing namesapce information. This is necessary because the // default value from JAXP 1.0 was defined to be false. dbf_singleton.setNamespaceAware(true); // Set the validation mode to either: no validation, DTD // validation, or XSD validation dbf_singleton.setValidating(false); // Optional: set various configuration options dbf_singleton.setIgnoringComments(true); dbf_singleton.setIgnoringElementContentWhitespace(true); dbf_singleton.setCoalescing(true); // The opposite of creating entity ref nodes is expanding them inline // NOTE that usage of, e.g. "&" in text results in an entity ref. e.g. // if ("BUY".equals (type) " // ENT_REF: nodeName="amp" // TEXT: nodeName="#text" nodeValue="&" dbf_singleton.setExpandEntityReferences(true); } return (dbf_singleton); } private void create(InputStream _input_stream) throws SimpleXMLParserDocumentException { // make sure we can mark the stream to permit later recovery if needed if (!_input_stream.markSupported()) { _input_stream = new BufferedInputStream(_input_stream); } _input_stream.mark(100 * 1024); // prevent the parser from screwing with our stream by closing it UncloseableInputStream uc_is = new UncloseableInputStream(_input_stream); try { createSupport(uc_is); } catch (SimpleXMLParserDocumentException e) { String msg = Debug.getNestedExceptionMessage(e); if ((msg.contains("entity") && msg.contains("was referenced")) || msg.contains("entity reference")) { try { // nasty hack to try and handle HTML entities that some annoying feeds include :( _input_stream.reset(); createSupport(new EntityFudger(_input_stream)); return; } catch (Throwable f) { } } //Debug.out( e ); throw (e); } finally { try { _input_stream.close(); } catch (Throwable e) { } } } private void createSupport(InputStream input_stream) throws SimpleXMLParserDocumentException { try { DocumentBuilderFactory dbf = getDBF(); // Step 2: create a DocumentBuilder that satisfies the constraints // specified by the DocumentBuilderFactory DocumentBuilder db = dbf.newDocumentBuilder(); // Set an ErrorHandler before parsing OutputStreamWriter errorWriter = new OutputStreamWriter(System.err); MyErrorHandler error_handler = new MyErrorHandler(new PrintWriter(errorWriter, true)); db.setErrorHandler(error_handler); db.setEntityResolver(new EntityResolver() { public InputSource resolveEntity(String publicId, String systemId) { // System.out.println( publicId + ", " + systemId ); // handle bad DTD external refs try { URL url = new URL(systemId); if (source_url != null) { String net = AENetworkClassifier.categoriseAddress(source_url.getHost()); if (net != AENetworkClassifier.AT_PUBLIC) { if (AENetworkClassifier.categoriseAddress(url.getHost()) != net) { return new InputSource(new ByteArrayInputStream( "<?xml version='1.0' encoding='UTF-8'?>".getBytes())); } } } String host = url.getHost(); InetAddress.getByName(host); // try connecting too as connection-refused will also bork XML parsing InputStream is = null; try { URLConnection con = url.openConnection(); con.setConnectTimeout(15 * 1000); con.setReadTimeout(15 * 1000); is = con.getInputStream(); byte[] buffer = new byte[32]; int pos = 0; while (pos < buffer.length) { int len = is.read(buffer, pos, buffer.length - pos); if (len <= 0) { break; } pos += len; } String str = new String(buffer, "UTF-8").trim().toLowerCase(Locale.US); if (!str.contains("<?xml")) { // not straightforward to check for naked DTDs, could be lots of <!-- commentry preamble which of course can occur // in HTML too buffer = new byte[32000]; pos = 0; while (pos < buffer.length) { int len = is.read(buffer, pos, buffer.length - pos); if (len <= 0) { break; } pos += len; } str += new String(buffer, "UTF-8").trim().toLowerCase(Locale.US); if (str.contains("<html") && str.contains("<head")) { throw (new Exception("Bad DTD")); } } } catch (Throwable e) { return new InputSource( new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes())); } finally { if (is != null) { try { is.close(); } catch (Throwable e) { } } } return (null); } catch (UnknownHostException e) { return new InputSource( new ByteArrayInputStream("<?xml version='1.0' encoding='UTF-8'?>".getBytes())); } catch (Throwable e) { return (null); } } }); // Step 3: parse the input file document = db.parse(input_stream); SimpleXMLParserDocumentNodeImpl[] root_nodes = parseNode(document, false); int root_node_count = 0; // remove any processing instructions such as <?xml-stylesheet for (int i = 0; i < root_nodes.length; i++) { SimpleXMLParserDocumentNodeImpl node = root_nodes[i]; if (node.getNode().getNodeType() != Node.PROCESSING_INSTRUCTION_NODE) { root_node = node; root_node_count++; } } if (root_node_count != 1) { throw (new SimpleXMLParserDocumentException( "invalid document - " + root_nodes.length + " root elements")); } } catch (Throwable e) { throw (new SimpleXMLParserDocumentException(e)); } } public String getName() { return (root_node.getName()); } public String getFullName() { return (root_node.getFullName()); } public String getNameSpaceURI() { return (root_node.getNameSpaceURI()); } public String getValue() { return (root_node.getValue()); } public SimpleXMLParserDocumentNode[] getChildren() { return (root_node.getChildren()); } public SimpleXMLParserDocumentNode getChild(String name) { return (root_node.getChild(name)); } public SimpleXMLParserDocumentAttribute[] getAttributes() { return (root_node.getAttributes()); } public SimpleXMLParserDocumentAttribute getAttribute(String name) { return (root_node.getAttribute(name)); } public void print() { PrintWriter pw = new PrintWriter(System.out); print(pw); pw.flush(); } public void print(PrintWriter pw) { root_node.print(pw, ""); } // idea is to flatten out any unwanted structure. We just want the resultant // tree to have nodes for each nesting element and leaves denoting name/value bits protected SimpleXMLParserDocumentNodeImpl[] parseNode(Node node, boolean skip_this_node) { int type = node.getNodeType(); if ((type == Node.ELEMENT_NODE || type == Node.PROCESSING_INSTRUCTION_NODE) && !skip_this_node) { return (new SimpleXMLParserDocumentNodeImpl[] { new SimpleXMLParserDocumentNodeImpl(this, node) }); } Vector v = new Vector(); for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) { SimpleXMLParserDocumentNodeImpl[] kids = parseNode(child, false); for (int i = 0; i < kids.length; i++) { v.addElement(kids[i]); } } SimpleXMLParserDocumentNodeImpl[] res = new SimpleXMLParserDocumentNodeImpl[v.size()]; v.copyInto(res); return (res); } private static class MyErrorHandler implements ErrorHandler { /** Error handler output goes here */ //private PrintWriter out; MyErrorHandler(PrintWriter out) { //this.out = out; } /** * Returns a string describing parse exception details */ private String getParseExceptionInfo(SAXParseException spe) { String systemId = spe.getSystemId(); if (systemId == null) { systemId = "null"; } String info = "URI=" + systemId + " Line=" + spe.getLineNumber() + ": " + spe.getMessage(); return info; } // The following methods are standard SAX ErrorHandler methods. // See SAX documentation for more info. public void warning(SAXParseException spe) throws SAXException { // out.println("Warning: " + getParseExceptionInfo(spe)); } public void error(SAXParseException spe) throws SAXException { String message = "Error: " + getParseExceptionInfo(spe); throw new SAXException(message); } public void fatalError(SAXParseException spe) throws SAXException { String message = "Fatal Error: " + getParseExceptionInfo(spe); throw new SAXException(message, spe); } } private static class EntityFudger extends InputStream { private InputStream is; char[] buffer = new char[16]; int buffer_pos = 0; char[] insertion = new char[16]; int insertion_pos = 0; int insertion_len = 0; public EntityFudger(InputStream _is) { is = _is; } @Override public int read() throws IOException { if (insertion_len > 0) { int result = insertion[insertion_pos++] & 0xff; if (insertion_pos == insertion_len) { insertion_pos = 0; insertion_len = 0; } return (result); } while (true) { int b = is.read(); if (b < 0) { // end of file if (buffer_pos == 0) { return (b); } else if (buffer_pos == 1) { buffer_pos = 0; return (buffer[0] & 0xff); } else { System.arraycopy(buffer, 1, insertion, 0, buffer_pos - 1); insertion_len = buffer_pos - 1; insertion_pos = 0; buffer_pos = 0; return (buffer[0] & 0xff); } } // normal byte if (buffer_pos == 0) { if (b == '&') { buffer[buffer_pos++] = (char) b; } else { return (b); } } else { if (buffer_pos == buffer.length - 1) { // buffer's full, give up buffer[buffer_pos++] = (char) b; System.arraycopy(buffer, 0, insertion, 0, buffer_pos); buffer_pos = 0; insertion_pos = 0; insertion_len = buffer_pos; return (insertion[insertion_pos++]); } else { if (b == ';') { // got some kind of reference mebe buffer[buffer_pos++] = (char) b; String ref = new String(buffer, 1, buffer_pos - 2).toLowerCase(Locale.US); String replacement; if (ref.equals("amp") || ref.equals("lt") || ref.equals("gt") || ref.equals("quot") || ref.equals("apos") || ref.startsWith("#")) { replacement = new String(buffer, 0, buffer_pos); } else { int num = Entities.HTML40.entityValue(ref); if (num != -1) { replacement = "&#" + num + ";"; } else { replacement = new String(buffer, 0, buffer_pos); } } char[] chars = replacement.toCharArray(); System.arraycopy(chars, 0, insertion, 0, chars.length); buffer_pos = 0; insertion_pos = 0; insertion_len = chars.length; return (insertion[insertion_pos++]); } else { buffer[buffer_pos++] = (char) b; char c = (char) b; if (!Character.isLetterOrDigit(c)) { // handle naked & if (buffer_pos == 2 && buffer[0] == '&') { char[] chars = "&".toCharArray(); System.arraycopy(chars, 0, insertion, 0, chars.length); buffer_pos = 0; insertion_pos = 0; insertion_len = chars.length; // don't forget the char we just read insertion[insertion_len++] = (char) b; return (insertion[insertion_pos++]); } else { // not a valid entity reference System.arraycopy(buffer, 0, insertion, 0, buffer_pos); buffer_pos = 0; insertion_pos = 0; insertion_len = buffer_pos; return (insertion[insertion_pos++]); } } } } } } } public void close() throws IOException { is.close(); } public long skip(long n) throws IOException { // meh, vague attempt here if (insertion_len > 0) { // buffer is currently empty, shove remaining into buffer to unify processing int rem = insertion_len - insertion_pos; System.arraycopy(insertion, insertion_pos, buffer, 0, rem); insertion_pos = 0; insertion_len = 0; buffer_pos = rem; } if (n <= buffer_pos) { // skip is <= buffer contents int rem = buffer_pos - (int) n; System.arraycopy(buffer, (int) n, insertion, 0, rem); insertion_pos = 0; insertion_len = rem; return (n); } int to_skip = buffer_pos; buffer_pos = 0; return (is.skip(n - to_skip) + to_skip); } public int available() throws IOException { return (buffer_pos + is.available()); } } }