Parsing Sample with DOM : DOM « XML « Python Tutorial

from xml.dom import minidom, Node
import re, textwrap

class SampleScanner:
    """Walk a parsed book document and print a plain-text summary of it.

    The scanner looks for this element layout:

    * ``book`` elements directly under the node given to the constructor
    * inside a book: ``title``, ``author`` and ``chapter`` elements
    * inside an author: ``name`` (with ``first`` / ``last`` descendants)
      and ``affiliation``
    * inside a chapter: a ``number`` attribute, ``title`` descendants and
      ``para`` children

    Elements with any other tag name are ignored.  All output goes to
    standard output.  Every ``print`` call is written in the single-argument
    parenthesised form so the text produced is the same on Python 2 and
    Python 3.
    """

    # Matches any run of whitespace so it can be collapsed to one space.
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self, doc):
        """Scan *doc* immediately, handling each ``book`` child element.

        Args:
            doc: A DOM node (normally the document returned by
                ``minidom.parse``) whose direct children are inspected.
        """
        for child in doc.childNodes:
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == 'book':
                self.handleBook(child)

    def gettext(self, nodelist):
        """Return the text contained in *nodelist* with whitespace collapsed.

        Text and CDATA nodes contribute their own data; any other node that
        has children contributes the text of those children, recursively.
        Nodes without children (comments, for instance) contribute nothing.

        Args:
            nodelist: An iterable of DOM nodes.

        Returns:
            The concatenated text, with every run of whitespace replaced by
            a single space.  Leading/trailing whitespace is collapsed to one
            space, not stripped.
        """
        parts = []
        for node in nodelist:
            if node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
                # Use the node's own data, not wholeText: wholeText also
                # includes adjacent text/CDATA siblings, which this loop
                # visits anyway, so their content would be emitted twice.
                parts.append(node.data)
            elif node.hasChildNodes():
                # hasChildNodes is a method and must be called; the bare
                # attribute is always truthy.
                parts.append(self.gettext(node.childNodes))

        return self._WHITESPACE_RUN.sub(' ', ''.join(parts))

    def handleBook(self, node):
        """Print the title of a ``book`` element and process its contents.

        Args:
            node: The ``book`` element.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == 'title':
                print("Book title is: %s" % self.gettext(child.childNodes))
            elif child.tagName == 'author':
                self.handleAuthor(child)
            elif child.tagName == 'chapter':
                self.handleChapter(child)

    def handleAuthor(self, node):
        """Print the name and affiliation found under an ``author`` element.

        Args:
            node: The ``author`` element.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == 'name':
                self.handleAuthorName(child)
            elif child.tagName == 'affiliation':
                print("Author affiliation: %s" % self.gettext([child]))

    def handleAuthorName(self, node):
        """Print an author's name as ``last, first``.

        Args:
            node: The ``name`` element; the text of its ``last`` and
                ``first`` descendant elements is used.
        """
        surname = self.gettext(node.getElementsByTagName("last"))
        givenname = self.gettext(node.getElementsByTagName("first"))
        print("Author Name: %s, %s" % (surname, givenname))

    def handleChapter(self, node):
        """Print a chapter heading followed by each of its paragraphs.

        Args:
            node: The ``chapter`` element.  Its ``number`` attribute and the
                text of its ``title`` descendants form the heading; only
                direct ``para`` children are printed as paragraphs.
        """
        heading = self.gettext(node.getElementsByTagName('title'))
        print(" *** Start of Chapter %s: %s"
              % (node.getAttribute('number'), heading))
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == 'para':
                self.handlePara(child)

    def handlePara(self, node):
        """Print one paragraph re-wrapped by ``textwrap.fill``, then a blank line.

        Args:
            node: The ``para`` element.
        """
        partext = textwrap.fill(self.gettext([node]))
        print(partext)
        # Blank separator line between paragraphs.
        print("")

# Script entry point: parse 'sample.xml' (path is relative to the current
# working directory) into a DOM document.
doc = minidom.parse('sample.xml') 
# Constructing the scanner performs the whole scan and prints the results;
# the instance itself is not kept.
SampleScanner(doc)

20.2.DOM
	20.2.1.	Processing XML
	20.2.2.	Accessing Child Nodes
	20.2.3.	Accessing Element Attributes
	20.2.4.	Adding a Node to a DOM Tree
	20.2.5.	Removing a Node from a DOM Tree
	20.2.6.	Parsing XML
	20.2.7.	Getting Child Nodes
	20.2.8.	toxml Works on Any Node
	20.2.9.	Child Nodes Can Be Text
	20.2.10.	All the Way to Text
	20.2.11.	Searching for XML Elements
	20.2.12.	Every Element Is Searchable
	20.2.13.	Searching Is Actually Recursive
	20.2.14.	Parsing XML from a File
	20.2.15.	Parsing XML from a URL
	20.2.16.	Parsing XML from a String
	20.2.17.	Class Names of Parsed XML Objects
	20.2.18.	Generating XML with DOM
	20.2.19.	Parsing Sample with DOM
	20.2.20.	Tree Generation with DOM