#!/usr/bin/python
# titlebytes - find the title and size of documents
#
# Differences from the Perl version:
#
# * no URI::Heuristics (see the sketch at the end of this file)
# * Perl's LWP supports fetching files from the local system
# * fetching a title via ftp or file doesn't work in Perl either

import sys, urllib2, HTMLParser

if len(sys.argv) <= 1:
    print "usage: %s url" % sys.argv[0]
    sys.exit(1)

raw_url = sys.argv[1]

# Python has no equivalent of Perl's URI::Heuristics, which would do
# some guessing such as:
#
#   perl            -> http://www.perl.com
#   www.oreilly.com -> http://www.oreilly.com
#   ftp.funet.fi    -> ftp://ftp.funet.fi
#   /etc/passwd     -> file:/etc/passwd

# Simple but pedantic HTML parser; ill-formed markup (e.g. tpj.com) breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        # set up our own state before the base class initializes itself,
        # so __getattr__ below never sees a half-constructed instance
        self._data = {}
        self._open_tags = []
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if len(self._open_tags) > 0:
            self._open_tags.pop()

    def handle_data(self, data):
        # remember the last run of text seen inside the innermost open tag
        if len(self._open_tags) > 0:
            self._data[self._open_tags[-1]] = data

    def __getattr__(self, attr):
        # parser.title returns the text collected for <title>,
        # or "" if that tag never appeared
        return self._data.get(attr, "")

url = raw_url
print "%s =>\n\t" % url,

# TODO: fake user agent "Schmozilla/v9.17 Platinum"
# TODO: referer "http://wizard.yellowbrick.oz"
# As we only fetch http, httplib would do as well.
try:
    response = urllib2.urlopen(url)
except urllib2.URLError, e:
    # covers both unreachable hosts (URLError) and HTTP error codes (HTTPError)
    print " %s" % e
    sys.exit(1)

# The title is not in the response headers, so parse the body for it.
data = response.read()
parser = html()
parser.feed(data)
parser.close()          # force processing of all buffered data
count = len(data.split("\n"))
bytes = len(data)
print "%s (%d lines, %d bytes)" % (parser.title, count, bytes)
# Only the byte count is available from response.info() (Content-Length).
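
# ----------------------------------------------------------------------
# A minimal sketch of the URI::Heuristics-style guessing described in the
# comment near the top of this file, assuming only the four cases listed
# there matter. guess_url() is a hypothetical helper, not a standard
# library function, and the script above does not call it; to use it you
# would wrap the command-line argument as url = guess_url(raw_url).
def guess_url(raw):
    if "://" in raw or raw.startswith("file:"):
        return raw                          # already a full URL
    if raw.startswith("/"):
        return "file:" + raw                # /etc/passwd    -> file:/etc/passwd
    if raw.startswith("ftp."):
        return "ftp://" + raw               # ftp.funet.fi   -> ftp://ftp.funet.fi
    if "." in raw:
        return "http://" + raw              # www.oreilly.com -> http://www.oreilly.com
    return "http://www.%s.com" % raw        # perl           -> http://www.perl.com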