Use SGML parser : sgmllib « XML « Python Tutorial






import sgmllib
import string

# Path of the HTML file that is opened below to look up its <title>.
filename = "index.html"
class CleanExit(Exception):
    """Signal used to abandon parsing early once the title has been found.

    It carries no data of its own; it is raised purely for control flow and
    is expected to be caught by the code that drives the parser.
    """

class Titlefinder(sgmllib.SGMLParser):
    """SGML parser that captures the text content of a ``<title>`` element.

    Once the closing ``</title>`` tag is handled, the collected text is
    stored in ``self.title`` and ``CleanExit`` is raised so that the caller
    can stop feeding input.
    """

    def __init__(self, verbose=0):
        """Initialise the base parser with no title and no active capture.

        ``verbose`` is passed straight through to ``sgmllib.SGMLParser``.
        """
        sgmllib.SGMLParser.__init__(self, verbose)
        # ``data`` is None while outside <title> and a list of text chunks
        # while inside it; ``title`` stays None until </title> is handled.
        self.title = self.data = None

    def start_title(self, attributes):
        """Start a fresh capture buffer when a ``<title>`` tag opens."""
        self.data = []

    def end_title(self):
        """Store the captured title text, then raise ``CleanExit``.

        Raises:
            CleanExit: always, to tell the caller the title is available.
        """
        # str.join replaces the deprecated string.join(words, sep) helper
        # and produces the same result.
        self.title = "".join(self.data)
        # Leave capture mode so that text fed after the title (should the
        # caller keep parsing) is not accumulated in the buffer.
        self.data = None
        raise CleanExit

    def handle_data(self, data):
        """Buffer ``data``, but only while a title capture is active."""
        if self.data is not None:
            self.data.append(data)

def get_title(filehandle):
    """Return the document title read from ``filehandle``, or None.

    The input is read in 1024-character chunks and fed to a ``Titlefinder``.
    If the parser raises ``CleanExit`` the parser's ``title`` attribute is
    returned; if the input is exhausted without that happening, the result
    is None.
    """
    finder = Titlefinder()
    try:
        chunk = filehandle.read(1024)
        while chunk:
            finder.feed(chunk)
            chunk = filehandle.read(1024)
        # Input exhausted: let the parser process whatever is still buffered.
        finder.close()
    except CleanExit:
        return finder.title
    else:
        return None

# Script entry: open the configured file, extract its title and report it.
filehandle = open(filename)
try:
    title = get_title(filehandle)
finally:
    # Always release the file, even if parsing fails. try/finally is used
    # instead of ``with`` so the script keeps working on old interpreters.
    filehandle.close()

# The single-argument parenthesised form prints the same text under the
# Python 2 print statement and is also valid as a function call.
print("The page's title is: %s" % title)








20.7.sgmllib
20.7.1.Use SGML parser