"""Print the target of every hyperlink found in a web page.

Runs on Python 3 and, through the import fallback below, on Python 2.
"""
import sys
from contextlib import closing

try:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from HTMLParser import HTMLParser
    from urllib import urlopen

# Page scanned when no URL is given on the command line.
DEFAULT_URL = "http://www.python.org/index.html"


class parseLinks(HTMLParser):
    """HTML parser that prints each ``<a href=...>`` link it encounters."""

    def handle_starttag(self, tag, attrs):
        """Print the href value and the raw start tag of every anchor.

        Args:
            tag: Tag name as supplied by the parser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href':
                print(value)
                print(self.get_starttag_text())


def _fetch_html(url):
    """Download *url* and return its body as text.

    The response is closed even if reading or decoding fails.
    """
    with closing(urlopen(url)) as response:
        body = response.read()
        if isinstance(body, str):
            # Python 2: read() already returns a str that feed() accepts.
            return body
        # Python 3: read() returns bytes, which feed() cannot concatenate
        # with its text buffer; decode with the charset the server
        # declared, falling back to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        return body.decode(charset, "replace")


def main(argv=None):
    """Fetch a page and print the links it contains.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, when present, is the URL
            to scan; otherwise ``DEFAULT_URL`` is used.
    """
    args = sys.argv[1:] if argv is None else argv
    url = args[0] if args else DEFAULT_URL
    link_parser = parseLinks()
    link_parser.feed(_fetch_html(url))
    link_parser.close()


# Guarded so that importing this module does not trigger a network request.
if __name__ == "__main__":
    main()
21.21. Parse HTML
21.21.1. Extract list of URLs in a web page
21.21.2. Opening HTML Documents
21.21.3. Retrieving Links from HTML Documents
21.21.4. Retrieving Images from HTML Documents
21.21.5. Retrieving Text from HTML Documents
21.21.6. Retrieving Cookies in HTML Documents
21.21.7. Adding Quotes to Attribute Values in HTML Documents
21.21.8. Basic HTML Title Retriever
21.21.9. HTML Title Retriever With Entity Support