#!/usr/bin/python
# htitlebytes - get html title from URL
#
"""Print the text of the <title> element of every URL given on the command line.

Usage: htitlebytes url ...

One line is written to standard output per URL: the URL, a colon, and either
the title text or the error that prevented the page from being fetched.  The
exit status is 1 on a usage error or if any URL could not be fetched, else 0.
"""

import contextlib
import sys

try:
    # Python 2 module names, as this script was originally written.
    import urllib2
    import HTMLParser
except ImportError:
    # Python 3 moved both modules; alias them so the code below is unchanged.
    import urllib.request as urllib2
    import html.parser as HTMLParser

# Elements that never have an end tag.  They must not be tracked as "open":
# otherwise the text following e.g. <meta> or <br> is attributed to them and
# every later end tag pops the wrong entry off the stack.
_VOID_TAGS = frozenset([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
])


# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    """Collect the text found directly inside each kind of element.

    After feeding a document, the text of the most recently seen element
    with a given tag name is available as an attribute of that name, e.g.
    ``parser.title``.  Tags that were never seen (or held no text) read as
    the empty string.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}        # tag name -> text of the latest such element
        self._open_tags = []   # stack of currently open tag names
        self._open_text = []   # parallel stack: text chunks of each open tag

    def handle_starttag(self, tag, attrs):
        """Record *tag* as open, unless it is a void element."""
        if tag in _VOID_TAGS:
            return
        self._open_tags.append(tag)
        self._open_text.append([])

    def handle_endtag(self, tag):
        """Close *tag* together with any children left open inside it.

        End tags that match nothing currently open (stray tags, or the
        second half of a self-closing void element) are ignored instead of
        popping an unrelated entry.
        """
        if tag not in self._open_tags:
            return
        while self._open_tags:
            self._open_text.pop()
            if self._open_tags.pop() == tag:
                break

    def handle_data(self, data):
        """Append *data* to the text of the innermost open element."""
        if not self._open_tags:
            return
        # The parser may deliver one element's text in several pieces, so
        # accumulate them rather than keeping only the last piece.
        chunks = self._open_text[-1]
        chunks.append(data)
        self._data[self._open_tags[-1]] = "".join(chunks)

    def handle_entityref(self, name):
        """Keep a named reference (``&name;``) verbatim as part of the text.

        Only called when the parser does not convert references itself.
        """
        self.handle_data("&%s;" % name)

    def handle_charref(self, name):
        """Keep a numeric reference (``&#name;``) verbatim as part of the text.

        Only called when the parser does not convert references itself.
        """
        self.handle_data("&#%s;" % name)

    def __getattr__(self, attr):
        """Return the text collected for tag *attr*, or "" if there is none.

        Only reached when normal attribute lookup fails.  Names starting
        with an underscore are never tag names, so they raise
        AttributeError as usual; answering "" for them would break
        hasattr()-based protocols and could recurse while ``_data`` is
        not yet set.
        """
        if attr.startswith("_"):
            raise AttributeError(attr)
        return self._data.get(attr, "")

    def error(self, msg):
        # ignore all errors
        pass


def _fetch(url):
    """Return the body of *url* in the string type that html.feed() expects.

    Raises whatever urlopen() or read() raise; the response is always
    closed.
    """
    # TODO fake user agent "Schmozilla/v9.17 Platinum"
    # TODO referer "http://wizard.yellowbrick.oz"
    # as we only do http httplib would do also
    with contextlib.closing(urllib2.urlopen(url)) as response:
        body = response.read()
        if isinstance(body, str):
            # Python 2: already a (byte) string, feed it unchanged.
            return body
        charset = response.headers.get_content_charset() or "utf-8"
    try:
        return body.decode(charset, "replace")
    except LookupError:
        # The server announced a charset Python does not know.
        return body.decode("utf-8", "replace")


def main(argv):
    """Print the title of every URL in ``argv[1:]``; return the exit status.

    A URL that cannot be fetched is reported on its own output line and the
    remaining URLs are still processed.
    """
    if len(argv) <= 1:
        sys.stdout.write("usage: %s url ...\n" % (argv[0] if argv else "htitlebytes"))
        return 1

    status = 0
    for url in argv[1:]:
        sys.stdout.write("%s: " % url)
        try:
            page = _fetch(url)
        except Exception as exc:
            sys.stdout.write("  %s\n" % exc)
            status = 1
            continue

        # title is not in response
        parser = html()
        parser.feed(page)
        parser.close()  # force processing all data
        sys.stdout.write(" %s\n" % parser.title)
    return status


if __name__ == "__main__":
    sys.exit(main(sys.argv))