#!/usr/bin/python
# htitlebytes - get html title from URL
#
import sys, urllib2, HTMLParser
# Refuse to run without at least one URL argument.
if len(sys.argv) <= 1:
    # The usage text is a diagnostic, so it goes to stderr and does not end
    # up in output that a caller captures from stdout.
    sys.stderr.write("usage: %s url ...\n" % sys.argv[0])
    sys.exit(1)
# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    """Collect the text content of HTML elements, keyed by tag name.

    After a document has been fed, the text of the most recent element of
    a given tag that contained any text is readable as an attribute named
    after the tag, e.g. ``parser.title``.  Tags that were never seen, or
    that never held text, read as the empty string.
    """

    # Elements that never take an end tag.  Leaving them on the open-tag
    # stack would credit the text that follows them to the wrong element.
    _VOID_TAGS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self):
        # Own state is set up first so that __getattr__ never runs against
        # an instance that lacks it.
        self._data = {}          # tag name -> text of its latest element
        self._open_tags = []     # stack of (tag name, list of text chunks)
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Push a newly opened element, unless it is a void element."""
        if tag not in self._VOID_TAGS:
            # A fresh chunk list means this occurrence replaces the text
            # recorded for an earlier element with the same tag.
            self._open_tags.append((tag, []))

    def handle_endtag(self, tag):
        """Close the innermost open element named *tag*.

        Elements opened inside it that were never closed are discarded
        along with it.  An end tag with no matching open element is
        ignored instead of popping an unrelated element.
        """
        for depth in range(len(self._open_tags) - 1, -1, -1):
            if self._open_tags[depth][0] == tag:
                del self._open_tags[depth:]
                return

    def handle_data(self, data):
        """Add *data* to the text of the innermost open element.

        The base parser may deliver one element's text in several calls,
        so chunks are accumulated rather than overwritten.  Text outside
        any open element is dropped.
        """
        if not self._open_tags:
            return
        tag, chunks = self._open_tags[-1]
        chunks.append(data)
        self._data[tag] = "".join(chunks)

    def handle_entityref(self, name):
        """Keep a named reference (``&name;``) in the text, unconverted."""
        # Only reached when the base parser does not convert references
        # itself; without this the reference would vanish from the text.
        self.handle_data("&%s;" % name)

    def handle_charref(self, name):
        """Keep a numeric reference (``&#name;``) in the text, unconverted."""
        self.handle_data("&#%s;" % name)

    def __getattr__(self, attr):
        """Return the text collected for tag *attr*, or "" if there is none.

        Names starting with an underscore are never tag names; they raise
        AttributeError so that hasattr() and similar protocol checks work
        and a missing ``_data`` cannot cause infinite recursion.
        """
        if attr.startswith("_"):
            raise AttributeError(attr)
        return self._data.get(attr, "")

    def error(self, msg):
        """Ignore every error passed to this hook."""
        # Deliberate best effort: a malformed page should not abort the run.
        pass
# Fetch every URL given on the command line and print "url:  title", one
# line per URL.  The run stops with exit status 1 at the first URL that
# cannot be opened.
for url in sys.argv[1:]:
    # The prefix is written before fetching.  The spacing of the pieces
    # written after it reproduces what the earlier print statements emitted.
    sys.stdout.write("%s: " % url)
    # TODO fake user agent "Schmozilla/v9.17 Platinum"
    # TODO referer "http://wizard.yellowbrick.oz"
    # as we only do http httplib would do also
    try:
        response = urllib2.urlopen(url)
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit must still propagate.
        sys.stdout.write("  %s\n" % sys.exc_info()[1])
        sys.exit(1)
    # title is not in response; parse the body for it
    try:
        parser = html()
        parser.feed(response.read())
        parser.close()  # force processing all data
    finally:
        # Release the connection even if reading or parsing fails.
        response.close()
    # Collapse runs of whitespace (including newlines) so that each URL
    # yields exactly one output line.
    sys.stdout.write(" %s\n" % " ".join(parser.title.split()))