#-----------------------------
import urllib
content = urllib.urlopen(url).read()

try:
    import urllib
    content = urllib.urlopen(url).read()
except IOError:
    print "could not get %s" % url
#-----------------------------
# download the following standalone program
#!/usr/bin/python
# titlebytes - find the title and size of documents
#
# differences to perl
#
# * no URI::Heuristics
# * perl LWP supports fetching files from local system
# * fetching a title from ftp or file doesn't work in perl either.

import sys, urllib2, HTMLParser

if len(sys.argv) <= 1:
    print "usage: %s url" % sys.argv[0]
    sys.exit(1)
raw_url = sys.argv[1]

# python has no equivalent to perl's URI::Heuristics, which
# would do some guessing like:
#
#   perl            -> http://www.perl.com
#   www.oreilly.com -> http://www.oreilly.com
#   ftp.funet.fi    -> ftp://ftp.funet.fi
#   /etc/passwd     -> file:/etc/passwd

# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags) > 0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags) > 0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self, attr):
        if not self._data.has_key(attr):
            return ""
        return self._data[attr]

url = raw_url
print "%s =>\n\t" % url,

# TODO fake user agent "Schmozilla/v9.17 Platinum"
# TODO referer "http://wizard.yellowbrick.oz"
# as we only do http, httplib would do also
try:
    response = urllib2.urlopen(url)
except:
    print " %s" % sys.exc_info()[1].reason[1]
    sys.exit(1)
# title is not in response
data = response.read()
parser = html()
parser.feed(data)
parser.close()                  # force processing all data
count = len(data.split("\n"))
bytes = len(data)
print "%s (%d lines, %d bytes)" % (parser.title, count, bytes)
# only bytes is in response.info()
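
# The URI::Heuristics guessing mentioned above has to be done by hand in
# python; below is a minimal sketch using only the simple prefix rules
# listed in the comment (an illustration, not a port of the perl module):
def guess_url(raw):
    # explicit scheme already present?
    if raw.startswith("http:") or raw.startswith("ftp:") or raw.startswith("file:"):
        return raw
    if raw.startswith("/"):
        return "file:" + raw
    if raw.startswith("ftp."):
        return "ftp://" + raw
    return "http://" + raw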
# GET method
import httplib
conn = httplib.HTTPConnection('www.perl.com')
conn.request('GET', '/cgi-bin/cpan_mod?module=DB_File&readme=1')
r1 = conn.getresponse()
content = r1.read()

# POST method
import urllib
params = urllib.urlencode({'module': 'DB_File', 'readme': 1})
content = urllib.urlopen('http://www.perl.com', params).read()

# fields must be properly escaped:
# script.cgi?field1?arg=%22this%20isn%27t%20%3CEASY%3E%22

# proxies can be taken from the environment, or specified
# as the optional third parameter to urlopen.
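
# A minimal sketch of that third parameter; the proxy host below is a
# made-up placeholder, not part of the original recipe.
import urllib
proxies = {'http': 'http://proxy.example.com:8080'}    # hypothetical proxy
content = urllib.urlopen('http://www.perl.com', proxies=proxies).read()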
# download the following standalone program
#!/usr/bin/python
# xurl - extract unique, sorted list of links from URL

from HTMLParser import HTMLParser
import urllib
from sets import Set as set   # not needed in 2.4

class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            if attrs[0][0] == 'href':
                if attrs[0][1].find(':') == -1:
                    # we need to add the base URL.
                    self.urls.add(self.baseUrl + '/' + attrs[0][1])
                else:
                    self.urls.add(attrs[0][1])

url = 'http://www.perl.com/CPAN'
p = myParser(url)
s = urllib.urlopen(url)
data = s.read()
p.feed(data)
urllist = p.urls._data.keys()
urllist.sort()
print '\n'.join(urllist)
# Converting ASCII to HTML
# download the following standalone program
#!/usr/bin/python
# text2html - trivial html encoding of normal text

import sys
import re

# precompile regular expressions
re_quoted = re.compile(r"(?m)^(>.*?)$")
re_url    = re.compile(r"<URL:(.*)>")
re_http   = re.compile(r"(http:\S+)")
re_strong = re.compile(r"\*(\S+)\*")
re_em     = re.compile(r"\b_(\S+)_\b")

# split paragraphs
for para in open(sys.argv[1]).read().split("\n\n"):
    # TODO encode entities: don't encode "<>" but do "&"
    if para.startswith(" "):
        print "<pre>\n%s\n</pre>" % para
    else:
        para = re_quoted.sub(r"\1<br />", para)             # quoted text
        para = re_url.sub(r'<a href="\1">\1</a>', para)     # embedded URL
        para = re_http.sub(r'<a href="\1">\1</a>', para)    # guessed URL
        para = re_strong.sub(r"<strong>\1</strong>", para)  # this is *bold* here
        para = re_em.sub(r"<em>\1</em>", para)              # this is _italic_ here
        print "<p>\n%s\n</p>" % para                        # add paragraph tags

#-----------------------------
import sys, re
import htmlentitydefs

def encode_entities(s):
    # replace "&" first so the entities inserted below are not escaped again
    s = s.replace("&", "&amp;")
    for k, v in htmlentitydefs.codepoint2name.items():
        if k < 256 and chr(k) != "&":   # no unicodes
            s = s.replace(chr(k), "&%s;" % v)
    return s

print "<table>"
text = sys.stdin.read()
text = encode_entities(text)
text = re.sub(r"(\n[ \t]+)", " . ", text)   # continuation lines
text = re.sub(r"(?m)^(\S+?:)\s*(.*?)$",
              r'<tr><th align="left">\1</th><td>\2</td></tr>',
              text)
print text
print "</table>"
# Converting HTML to ASCII
#-----------------------------
import os
ascii = os.popen("lynx -dump " + filename).read()
#-----------------------------
import formatter
import htmllib

w = formatter.DumbWriter()
f = formatter.AbstractFormatter(w)
p = htmllib.HTMLParser(f)
p.feed(html)
p.close()

# Above is the bare minimum needed to use Python's
# writer/formatter/parser framework.
# Search the Python Cookbook for more details, such as writing
# your own writers or formatters.
# Recipe #52297 has TtyFormatter, formatting underline
# and bold in a terminal. Recipe #135005 has a writer
# accumulating text instead of printing.
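
# A minimal sketch of accumulating the converted text in a string instead
# of letting DumbWriter print to stdout; "html" is assumed to hold the
# markup, as in the snippet above.
import StringIO
import formatter
import htmllib

buf = StringIO.StringIO()
w = formatter.DumbWriter(buf)            # write into the buffer, not stdout
f = formatter.AbstractFormatter(w)
p = htmllib.HTMLParser(f)
p.feed(html)
p.close()
ascii = buf.getvalue()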
import re
plain_text = re.sub(r"<[^>]*>", "", html_text)   # WRONG

# using HTMLParser
import sys, HTMLParser

class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._plaintext = ""
        self._ignore = False
    def handle_starttag(self, tag, attrs):
        if tag == "script":
            self._ignore = True
    def handle_endtag(self, tag):
        if tag == "script":
            self._ignore = False
    def handle_data(self, data):
        if len(data) > 0 and not self._ignore:
            self._plaintext += data
    def get_plaintext(self):
        return self._plaintext
    def error(self, msg):
        # ignore all errors
        pass

html_text = open(sys.argv[1]).read()
parser = html()
parser.feed(html_text)
parser.close()                  # force processing all data
print parser.get_plaintext()

title_s = re.search(r"(?i)<title>\s*(.*?)\s*</title>", text)
title = title_s and title_s.groups()[0] or "NO TITLE"

# download the following standalone program
#!/usr/bin/python
# htitlebytes - get html title from URL
#
import sys, urllib2, HTMLParser

if len(sys.argv) <= 1:
    print "usage: %s url ..." % sys.argv[0]
    sys.exit(1)

# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags) > 0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags) > 0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self, attr):
        if not self._data.has_key(attr):
            return ""
        return self._data[attr]
    def error(self, msg):
        # ignore all errors
        pass

for url in sys.argv[1:]:
    print "%s: " % url,
    # TODO fake user agent "Schmozilla/v9.17 Platinum"
    # TODO referer "http://wizard.yellowbrick.oz"
    # as we only do http, httplib would do also
    try:
        response = urllib2.urlopen(url)
    except:
        print " %s" % sys.exc_info()[1]
        sys.exit(1)
    # title is not in response
    parser = html()
    parser.feed(response.read())
    parser.close()              # force processing all data
    print parser.title
# download the following standalone program
#!/usr/bin/python
# churl - check urls

import sys

# head request
import urllib
def valid(url):
    try:
        conn = urllib.urlopen(url)
        return 1
    except:
        return 0

# parser class as in xurl
from HTMLParser import HTMLParser
from sets import Set as set   # not needed in 2.4

class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            if attrs[0][0] == 'href':
                if attrs[0][1].find(':') == -1:
                    # we need to add the base URL.
                    self.urls.add(self.baseUrl + '/' + attrs[0][1])
                else:
                    self.urls.add(attrs[0][1])

if len(sys.argv) <= 1:
    print "usage: %s <start_url>" % (sys.argv[0])
    sys.exit(1)

base_url = sys.argv[1]
print base_url + ":"
p = myParser(base_url)
s = urllib.urlopen(base_url)
data = s.read()
p.feed(data)
for link in p.urls._data.keys():
    state = "UNKNOWN URL"
    if link.startswith("http:"):
        state = "BAD"
        if valid(link):
            state = "OK"
    print " %s: %s" % (link, state)
# download the following standalone program
#!/usr/bin/python
# surl - sort URLs by their last modification date

import urllib
import time
import sys

Date = {}

while 1:
    # we only read from stdin, not from argv.
    ln = sys.stdin.readline()
    if not ln:
        break
    ln = ln.strip()
    try:
        u = urllib.urlopen(ln)
        date = time.mktime(u.info().getdate("date"))
        if not Date.has_key(date):
            Date[date] = []
        Date[date].append(ln)
    except:
        sys.stderr.write("%s: %s!\n" % (ln, sys.exc_info()[1]))

dates = Date.keys()
dates.sort()   # python 2.4 would have sorted
for d in dates:
    print "%s %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(d)),
                     ", ".join(Date[d]))
import re

def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        if fillings.has_key(matchobj.group(1)):
            return str(fillings[matchobj.group(1)])
        return ""
    # replace quoted words with value from fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text

fields = {"username": "peter", "count": "23", "total": "1234"}
print template("/home/httpd/templates/simple.template", fields)

# download the following standalone program
#!/usr/bin/python
# userrep1 - report duration of user logins using SQL database

import MySQLdb
import cgi
import re
import sys

def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        if fillings.has_key(matchobj.group(1)):
            return str(fillings[matchobj.group(1)])
        return ""
    # replace quoted words with value from fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text

fields = cgi.FieldStorage()
if not fields.has_key("user"):
    print "Content-Type: text/plain\n"
    print "No username"
    sys.exit(1)

def get_userdata(username):
    db = MySQLdb.connect(passwd="", db="connections", user="bert")
    db.query("select count(duration) as count,"
             + " sum(duration) as total from logins"
             + " where username='%s'" % username)
    res = db.store_result().fetch_row(maxrows=1, how=1)
    res[0]["username"] = username
    db.close()
    return res[0]

print "Content-Type: text/html\n"
print template("report.tpl", get_userdata(fields["user"].value))

# @@INCOMPLETE@@
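
# The template file itself is not shown above; a plausible simple.template
# (assumed here for illustration) just mixes HTML with %%field%% placeholders
# matching the keys of the fillings dictionary:
#
#   <html><head><title>Report for %%username%%</title></head>
#   <body><h1>Report for %%username%%</h1>
#   <p>%%username%% logged in %%count%% times,
#   for a total of %%total%% minutes.</p>
#   </body></html>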
# @@INCOMPLETE@@
# @@INCOMPLETE@@
# @@INCOMPLETE@@
# @@INCOMPLETE@@
# sample data, use ``LOGFILE = open(sys.argv[1])`` in real life
LOGFILE = [
    '127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303\n',
    '127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"\n',
    '192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228\n',
    '192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"\n',
    ]

import re

# similar to the perl version.
web_server_log_re = re.compile(r'^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$')

# with group naming.
split_re = re.compile(r'''(?x)  # allow nicer formatting (but requires escaping blanks)
    ^(?P<client>\S+)\s
    (?P<identuser>\S+)\s
    (?P<authuser>\S+)\s
    \[
    (?P<date>[^:]+):
    (?P<time>[\d:]+)\s
    (?P<tz>[^\]]+)
    \]\s
    "
    (?P<method>\S+)\s
    (?P<url>.*?)\s
    (?P<protocol>\S+)
    "\s
    (?P<status>\S+)\s
    (?P<bytes>\S+)
    (?:
        \s
        "
        (?P<referrer>[^"]+)
        "\s
        "
        (?P<agent>[^"]+)
        "
    )?''')

for line in LOGFILE:
    f = split_re.match(line)
    if f:
        print "agent = %s" % f.groupdict()['agent']
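
# A short usage sketch: the same loop, pulling a few more of the named
# groups out of each matched line (the field names are those defined in
# split_re above).
for line in LOGFILE:
    f = split_re.match(line)
    if f:
        g = f.groupdict()
        print "%s %s %s -> %s bytes" % (g['client'], g['method'], g['url'], g['bytes'])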
# @@INCOMPLETE@@
# @@INCOMPLETE@@
# @@INCOMPLETE@@
# @@INCOMPLETE@@