# Normalizing Character Data Before Output
import sys
from xml.parsers import expat
def normalize_whitespace(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed as well, so a string that
    contains only whitespace (or nothing) becomes the empty string.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty leading/trailing fields, which is what trims the ends.
    return " ".join(text.split())
class SimpleParse(object):
    """Print the tag structure of an XML document with tidied text.

    An expat parser is wired to this object's handler methods. Character
    data is buffered in ``self.cdata`` and only printed, with its
    whitespace normalized, when the next start or end tag arrives.
    """

    def __init__(self):
        """Create the expat parser and register the handler methods."""
        self.parser = expat.ParserCreate()
        self.parser.StartElementHandler = self.start_element
        self.parser.EndElementHandler = self.end_element
        self.parser.CharacterDataHandler = self.character_data
        # Raw text chunks collected since the last tag boundary.
        self.cdata = []

    def parse(self, file):
        """Feed *file* to the parser via ``ParseFile``.

        *file* is passed straight through to expat, which reads from it
        with ``read(nbytes)``. On Python 3 that call has to return bytes,
        so the file should be opened in binary mode.
        """
        self.parser.ParseFile(file)

    def print_cdata(self):
        """Print the buffered text (if any is left after tidying) and reset."""
        txt = normalize_whitespace("".join(self.cdata))
        # Whitespace-only text between tags normalizes to "" and is skipped.
        if txt:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(txt)
        self.cdata = []

    def start_element(self, name, attrs):
        """Flush pending text, then report the opening tag and attributes."""
        self.print_cdata()
        print("Start: %s %s" % (name, attrs))

    def character_data(self, data):
        """Buffer a chunk of character data delivered by the parser."""
        # Chunks are joined in print_cdata, so text delivered in several
        # callbacks is still normalized as one piece.
        self.cdata.append(data)

    def end_element(self, name):
        """Flush pending text, then report the closing tag."""
        self.print_cdata()
        print("End: %s" % (name,))
# Parse the XML file named by the first command-line argument and print
# its structure.
p = SimpleParse()
# Binary mode: on Python 3 expat's ParseFile needs read() to return bytes.
# The with-block closes the file once parsing finishes or fails.
with open(sys.argv[1], "rb") as xml_file:
    p.parse(xml_file)
# Related examples in the same category