#!/usr/bin/python
# xurl - extract unique, sorted list of links from URL

from HTMLParser import HTMLParser
import urllib
from sets import Set as set # not needed in 2.4
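
# HTMLParser subclass that collects the target of every <a href=...>
# tag it sees, resolving relative links against the page's base URL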
class myParser(HTMLParser):
    def __init__(self, url):
        # remember the directory portion of the URL so relative
        # links can be resolved against it
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            # scan all attributes; href is not necessarily the first one
            for name, value in attrs:
                if name == 'href' and value is not None:
                    if value.find(':') == -1:
                        # relative link: prepend the base URL
                        self.urls.add(self.baseUrl + '/' + value)
                    else:
                        # absolute link (has a scheme): keep as-is
                        self.urls.add(value)
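
# driver: fetch the page, feed it through the parser,
# then print the unique links in sorted order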
url = 'http://www.perl.com/CPAN'
p = myParser(url)
s = urllib.urlopen(url)
data = s.read()
p.feed(data)
p.close()
# list() works for both sets.Set and the 2.4 built-in set;
# reaching into the private _data dict does not
urllist = list(p.urls)
urllist.sort()
print '\n'.join(urllist)