#!/usr/bin/python
# churl - check urls

import sys
import urllib        # urlopen is used both to fetch the page and to probe each link

def valid(url):
    """Return True if the URL can be opened, False otherwise."""
    try:
        urllib.urlopen(url)
        return True
    except:
        # any failure (DNS error, refused connection, bad scheme, ...) counts as invalid
        return False

# parser class as in xurl
from HTMLParser import HTMLParser
from sets import Set as set   # not needed in 2.4, where set is a builtin

class myParser(HTMLParser):
    def __init__(self, url):
        # everything up to the last '/' serves as the base for relative links
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)

    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        for name, value in attrs:   # href is not necessarily the first attribute
            if name == 'href':
                if value.find(':') == -1:
                    # relative link: we need to add the base URL
                    self.urls.add(self.baseUrl + '/' + value)
                else:
                    self.urls.add(value)

if len(sys.argv) <= 1:
    print "usage: %s url" % sys.argv[0]
    sys.exit(1)

base_url = sys.argv[1]
print base_url + ":"

p = myParser(base_url)
s = urllib.urlopen(base_url)
data = s.read()
p.feed(data)

for link in p.urls:   # iterate the set directly instead of reaching into _data
    state = "UNKNOWN URL"
    if link.startswith("http:"):
        state = "BAD"
        if valid(link):
            state = "OK"
    print " %s: %s" % (link, state)
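
# A usage sketch (the file name and URL below are hypothetical, assuming the
# script is saved as churl.py):
#
#   python churl.py http://example.com/index.html
#
# This prints the base URL followed by one "link: state" line for every
# <a href> found on the page, where state is OK, BAD, or UNKNOWN URL for
# links that do not use the http: scheme.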