#!/usr/bin/python
# churl - check urls
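#
# example invocation (hypothetical URL):
#   ./churl http://example.com/index.html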

import sys

# validity check: urllib.urlopen issues a GET (not a HEAD) request;
# note that it only raises for connection-level failures, so an
# HTTP error page (e.g. a 404) still counts as valid here
import urllib
def valid(url):
    try:
        conn = urllib.urlopen(url)
        conn.close()
        return True
    except IOError:
        return False
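# e.g. (hypothetical URLs): valid('http://example.com/') should be True
# if the host answers; valid('http://no.such.host.invalid/') is False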

# parser class as in xurl
from HTMLParser import HTMLParser
from sets import Set as set # the set builtin replaces this from Python 2.4 on
class myParser(HTMLParser):
    def __init__(self, url):
        # everything up to the last '/' is kept as the base for relative links
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        # scan all attributes: href need not be the first one, and
        # <a> tags without attributes must not crash the parser
        for name, value in attrs:
            if name == 'href' and value:
                if value.find(':') == -1:
                    # no scheme separator, so this is a relative link:
                    # prepend the base URL
                    self.urls.add(self.baseUrl + '/' + value)
                else:
                    self.urls.add(value)
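
# example (hypothetical page): feeding '<a class="x" href="sub/page.html">'
# to myParser('http://example.com/index.html') adds
# 'http://example.com/sub/page.html' to .urls, while an absolute
# 'href="http://other.org/"' is stored unchanged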

if len(sys.argv) <= 1:
    print "usage: %s <start_url>" % sys.argv[0]
    sys.exit(1)

base_url = sys.argv[1]
print base_url + ":"
p = myParser(base_url)
s = urllib.urlopen(base_url)
p.feed(s.read())
s.close()
p.close() # flush any data still buffered in the parser
for link in p.urls:
    state = "UNKNOWN URL"
    # only http: links are actually checked; everything else
    # (e.g. mailto: or ftp: URLs) stays UNKNOWN
    if link.startswith("http:"):
        state = "BAD"
        if valid(link):
            state = "OK"
    print "  %s: %s" % (link, state)