20. Web Automation

Introduction

Fetching a URL from a Python Script

#-----------------------------
import urllib
content = urllib.urlopen(url).read()

try:
    import urllib
    content = urllib.urlopen(url).read()
except IOError:
    print "could not get %s" % url

#-----------------------------
# download the following standalone program
#!/usr/bin/python
# titlebytes - find the title and size of documents
#
# differences from Perl
#
# * no URI::Heuristics equivalent
# * Perl's LWP supports fetching files from the local system
# * fetching a title from ftp: or file: URLs doesn't work in Perl either.

import sys, urllib2, HTMLParser
if len(sys.argv)<=1:
    print "usage: %s url" % sys.argv[0]
    sys.exit(1)
raw_url = sys.argv[1] 

# python has no equivalent to Perl's URI::Heuristics, which
# would do some guessing like (see the sketch after this program):
#
#   perl            -> http://www.perl.com
#   www.oreilly.com -> http://www.oreilly.com
#   ftp.funet.fi    -> ftp://ftp.funet.fi
#   /etc/passwd     -> file:/etc/passwd

# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags)>0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags)>0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self,attr):
        if not self._data.has_key(attr):
            return ""
        return self._data[attr]

url = raw_url
print "%s =>\n\t" % url,
# TODO fake user agent "Schmozilla/v9.17 Platinum"
# TODO referer "http://wizard.yellowbrick.oz"
# (see the sketch after this program for one way to set these headers)
# since we only fetch over http, httplib would also work
try:
    response = urllib2.urlopen(url)
except:
    print " %s" % sys.exc_info()[1]
    sys.exit(1)
# title is not in response
data = response.read()
parser = html()
parser.feed(data)
parser.close()  # force processing all data
count = len(data.split("\n"))
bytes = len(data)
print "%s (%d lines, %d bytes)" % (parser.title, 
        count, 
        bytes)

# only the byte count is available from response.info()
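
Python has no URI::Heuristics equivalent, but the guessing shown in the
comments above is easy to approximate. A rough sketch (the guess_url helper
and its rules are assumptions based on those examples, not a library feature):

#-----------------------------
import os, re

def guess_url(raw):
    # leave anything that already has a scheme alone
    if re.match(r"^\w+:", raw):
        return raw
    # absolute paths and existing files become file: URLs
    if raw.startswith("/") or os.path.exists(raw):
        return "file:" + raw
    # bare words expand to www.<word>.com
    if "." not in raw:
        return "http://www.%s.com" % raw
    # ftp.* hosts default to ftp://, everything else to http://
    if raw.startswith("ftp."):
        return "ftp://" + raw
    return "http://" + raw

for example in ["perl", "www.oreilly.com", "ftp.funet.fi", "/etc/passwd"]:
    print "%s -> %s" % (example, guess_url(example))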

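For the user-agent and referer TODOs in titlebytes, one possible approach
(a sketch, not part of the original program) is to build a urllib2.Request
and attach the headers before opening it:

#-----------------------------
import urllib2

request = urllib2.Request("http://www.perl.com/")
request.add_header("User-Agent", "Schmozilla/v9.17 Platinum")
request.add_header("Referer", "http://wizard.yellowbrick.oz")
content = urllib2.urlopen(request).read()
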
Automating Form Submission


# GET method
import httplib
conn = httplib.HTTPConnection('www.perl.com')
conn.request('GET','/cgi-bin/cpan_mod?module=DB_File&readme=1')
r1 = conn.getresponse()
content = r1.read()

# POST method
import urllib
params = urllib.urlencode({'module': 'DB_File', 'readme': 1})
content = urllib.urlopen('http://www.perl.com', params).read()

# fields must be properly escaped, e.g.
# script.cgi?arg=%22this%20isn%27t%20%3CEASY%3E%22

# proxies can be taken from the environment, or specified
# as the optional third parameter to urlopen.
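
For example (www.example.com and the proxy host below are placeholders):

#-----------------------------
import urllib

# escape a single field value by hand (urlencode does this for whole dicts)
arg = urllib.quote('"this isn\'t <EASY>"')
content = urllib.urlopen("http://www.example.com/script.cgi?arg=" + arg).read()

# pass an explicit proxy instead of relying on http_proxy in the environment
proxies = {"http": "http://proxy.example.com:8080/"}
content = urllib.urlopen("http://www.perl.com/", proxies=proxies).read()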

Extracting URLs

# download the following standalone program
#!/usr/bin/python
# xurl - extract unique, sorted list of links from URL

from HTMLParser import HTMLParser
import urllib
from sets import Set as set # not needed in 2.4
class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if value.find(':') == -1:
                        # relative link: we need to add the base URL
                        self.urls.add(self.baseUrl + '/' + value)
                    else:
                        self.urls.add(value)
url = 'http://www.perl.com/CPAN'
p = myParser(url)
s = urllib.urlopen(url)
data = s.read()
p.feed(data)
urllist = list(p.urls)
urllist.sort()
print '\n'.join(urllist)
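
The baseUrl string slicing above is fairly naive; a sketch of the same
extraction using urlparse.urljoin to resolve relative links (the
urljoinParser class name is made up for this example):

#-----------------------------
from HTMLParser import HTMLParser
import urllib, urlparse

class urljoinParser(HTMLParser):
    def __init__(self, base):
        HTMLParser.__init__(self)
        self.base = base
        self.urls = {}                      # dict used as a set
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # urljoin handles relative paths, "..", and absolute URLs
                    self.urls[urlparse.urljoin(self.base, value)] = True

url = 'http://www.perl.com/CPAN'
p = urljoinParser(url)
p.feed(urllib.urlopen(url).read())
urllist = p.urls.keys()
urllist.sort()
print '\n'.join(urllist)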

Converting ASCII to HTML

# Converting ASCII to HTML

# download the following standalone program
#!/usr/bin/python
# text2html - trivial html encoding of normal text

import sys
import re

# precompile regular expressions
re_quoted = re.compile(r"(?m)^(>.*?)$")
re_url = re.compile(r"<URL:(.*)>")
re_http = re.compile(r"(http:\S+)")
re_strong = re.compile(r"\*(\S+)\*")
re_em = re.compile(r"\b_(\S+)_\b")

# split paragraphs
for para in open(sys.argv[1]).read().split("\n\n"):
    # TODO encode entities: don't encode "<>" but do encode "&"
    # (see the sketch after this program)
    if para.startswith(" "):
        print "<pre>\n%s\n</pre>" % para
    else:
        para = re_quoted.sub(r"\1<br />", para)          # quoted text
        para = re_url.sub(r'<a href="\1">\1</a>', para)  # embedded URL
        para = re_http.sub(r'<a href="\1">\1</a>', para) # guessed URL
        para = re_strong.sub(r"<strong>\1</strong>",para)   # this is *bold* here
        para = re_em.sub(r"<em>\1</em>",para)            # this is _italic_ here
        print "<p>\n%s\n</p>" % para                     # add paragraph tags



#-----------------------------
import sys, re
import htmlentitydefs

def encode_entities(s):
    for k,v in htmlentitydefs.codepoint2name.items():
        if k<256: # no unicodes
            s = s.replace(chr(k),"&%s;"%v)
    return s

print "<table>"
text = sys.stdin.read()
text = encode_entities(text)
text = re.sub(r"(\n[ \t]+)"," . ",text)   # continuation lines
text = re.sub(r"(?m)^(\S+?:)\s*(.*?)$",
              r'<tr><th align="left">\1</th><td>\2</td></tr>',
                            text);
print text    
print "</table>"
                            

Converting HTML to ASCII

# Converting HTML to ASCII

#-----------------------------
import os
ascii = os.popen("lynx -dump " + filename).read()

#-----------------------------
import formatter
import htmllib

w = formatter.DumbWriter()
f = formatter.AbstractFormatter(w)
p = htmllib.HTMLParser(f)
p.feed(html)
p.close()

# The above is the bare minimum needed to use Python's
# writer/formatter/parser framework.

# Search the Python Cookbook for more details, such as writing
# your own writers or formatters.

# Recipe #52297 has a TtyFormatter that renders underline
# and bold in a terminal. Recipe #135005 has a writer that
# accumulates text instead of printing it (see the sketch below).
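
A minimal sketch of that accumulating-writer idea: DumbWriter accepts a
file-like object, so handing it a StringIO buffer captures the rendered text
instead of printing it (the sample HTML string is just for illustration).

#-----------------------------
import StringIO, formatter, htmllib

buf = StringIO.StringIO()
w = formatter.DumbWriter(buf)             # write into the buffer, not stdout
f = formatter.AbstractFormatter(w)
p = htmllib.HTMLParser(f)
p.feed("<html><body><h1>Heading</h1><p>Some text.</p></body></html>")
p.close()
ascii = buf.getvalue()                    # the accumulated plain text
print ascii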

Extracting or Removing HTML Tags


import re

plain_text = re.sub(r"<[^>]*>","",html_text) #WRONG

# using HTMLParser
import sys, HTMLParser

class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._plaintext = ""
        self._ignore = False
    def handle_starttag(self, tag, attrs):
        if tag == "script":
            self._ignore = True
    def handle_endtag(self, tag):
        if tag == "script":
            self._ignore = False
    def handle_data(self, data):
        if len(data)>0 and not self._ignore:
            self._plaintext += data
    def get_plaintext(self):
        return self._plaintext
    def error(self,msg):
        # ignore all errors
        pass

html_text = open(sys.argv[1]).read()

parser = html()
parser.feed(html_text)
parser.close()  # force processing all data
print parser.get_plaintext()

title_s = re.search(r"(?i)<title>\s*(.*?)\s*</title>", html_text)
title = title_s and title_s.groups()[0] or "NO TITLE"

# download the following standalone program
#!/usr/bin/python
# htitlebytes - get html title from URL
#

import sys, urllib2, HTMLParser
if len(sys.argv)<=1:
    print "usage: %s url ..." % sys.argv[0]
    sys.exit(1)

# simple but pedantic html parser: tpj.com breaks it.
class html(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._data = {}
        self._open_tags = []
    def handle_starttag(self, tag, attrs):
        self._open_tags.append(tag)
    def handle_endtag(self, tag):
        if len(self._open_tags)>0:
            self._open_tags.pop()
    def handle_data(self, data):
        if len(self._open_tags)>0:
            self._data[self._open_tags[-1]] = data
    def __getattr__(self,attr):
        if not self._data.has_key(attr):
            return ""
        return self._data[attr]
    def error(self,msg):
        # ignore all errors
        pass

for url in sys.argv[1:]:
    print "%s: " % url,
    # TODO fake user agent "Schmozilla/v9.17 Platinum"
    # TODO referer "http://wizard.yellowbrick.oz"
    # since we only fetch over http, httplib would also work
    try:
        response = urllib2.urlopen(url)
    except:
        print " %s" % sys.exc_info()[1]
        continue
    # title is not in response
    parser = html()
    parser.feed(response.read())
    parser.close()  # force processing all data
    print parser.title 

Finding Stale Links

# download the following standalone program
#!/usr/bin/python
# churl - check urls

import sys

# a HEAD request would suffice, but urllib.urlopen issues a GET
# (see the httplib sketch after this program)
import urllib
def valid(url):
    try:
        conn = urllib.urlopen(url)
        return 1
    except:
        return 0

# parser class as in xurl
from HTMLParser import HTMLParser
from sets import Set as set # not needed in 2.4
class myParser(HTMLParser):
    def __init__(self, url):
        self.baseUrl = url[:url.rfind('/')]
        HTMLParser.__init__(self)
    def reset(self):
        self.urls = set()
        HTMLParser.reset(self)
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if value.find(':') == -1:
                        # relative link: we need to add the base URL
                        self.urls.add(self.baseUrl + '/' + value)
                    else:
                        self.urls.add(value)

if len(sys.argv)<=1:
    print "usage: %s <start_url>" % (sys.argv[0])
    sys.exit(1)
    
base_url = sys.argv[1]
print base_url+":"
p = myParser(base_url)
s = urllib.urlopen(base_url)
data = s.read()
p.feed(data)
for link in p.urls:
    state = "UNKNOWN URL"
    if link.startswith("http:"):
        state = "BAD"
        if valid(link):
            state = "OK"
    print "  %s: %s" % (link, state)

Finding Fresh Links

# download the following standalone program
#!/usr/bin/python
# surl - sort URLs by their last modification date

import urllib
import time
import sys

Date = {}
while 1:
    # we only read URLs from stdin, not from argv.
    ln = sys.stdin.readline()
    if not ln:
        break
    ln = ln.strip()
    try:
        u = urllib.urlopen(ln)
        date = time.mktime(u.info().getdate("date"))
        if not Date.has_key(date):
            Date[date] = []
        Date[date].append(ln)
    except:
        sys.stderr.write("%s: %s!\n" % (ln, sys.exc_info()[1]))

dates = Date.keys()
dates.sort()    # python 2.4 would have sorted
for d in dates:
    print "%s  %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(d)),
                    ", ".join(Date[d]))

Creating HTML Templates

import re

def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        if fillings.has_key(matchobj.group(1)):
            return str(fillings[matchobj.group(1)])
        return ""
    # replace %%word%% placeholders with values from the fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text

fields = { "username":"peter", "count":"23", "total": "1234"}
print template("/home/httpd/templates/simple.template", fields)
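
The template file itself is not shown here; a hypothetical simple.template
using the %%placeholder%% convention, written to /tmp so the template()
function above can be tried end to end:

#-----------------------------
simple = """<html><head><title>Report for %%username%%</title></head>
<body>%%username%% logged in %%count%% times,
for a total of %%total%% minutes.</body></html>
"""
open("/tmp/simple.template", "w").write(simple)
print template("/tmp/simple.template", fields)   # template() and fields from above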

# download the following standalone program
#!/usr/bin/python
# userrep1 - report duration of user logins using SQL database

import MySQLdb
import cgi
import re
import sys

def template(filename, fillings):
    text = open(filename).read()
    def repl(matchobj):
        if fillings.has_key(matchobj.group(1)):
            return str(fillings[matchobj.group(1)])
        return ""
    # replace %%word%% placeholders with values from the fillings dictionary
    text = re.sub("%%(.+?)%%", repl, text)
    return text

fields = cgi.FieldStorage()
if not fields.has_key("user"):
    print "Content-Type: text/plain\n"
    print "No username"
    sys.exit(1)

def get_userdata(username):
    db = MySQLdb.connect(passwd="",db="connections", user="bert")
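    # note: a cursor with parameter binding would be safer than
    # interpolating username directly into the SQL string below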
    db.query("select count(duration) as count,"
            +" sum(duration) as total from logins"
            +" where username='%s'" % username)
    res = db.store_result().fetch_row(maxrows=1,how=1)
    res[0]["username"] = username
    db.close()
    return res[0]
                        
print "Content-Type: text/html\n"

print template("report.tpl", get_userdata(fields["user"].value))

# @@INCOMPLETE@@

Mirroring Web Pages

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Creating a Robot

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Parsing a Web Server Log File


# sample data, use ``LOGFILE = open(sys.argv[1])`` in real life
LOGFILE = [
        '127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303\n',
        '127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"\n',
        '192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228\n',
        '192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"\n',
    ]

import re

# similar to the Perl version.
web_server_log_re = re.compile(r'^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)$')
    
# with group naming.
split_re = re.compile(r'''(?x)         # allow nicer formatting (but requires escaping blanks)
                       ^(?P<client>\S+)\s
                       (?P<identuser>\S+)\s
                       (?P<authuser>\S+)\s
                       \[
                         (?P<date>[^:]+):
                         (?P<time>[\d:]+)\s
                         (?P<tz>[^\]]+)
                       \]\s
                       "
                         (?P<method>\S+)\s
                         (?P<url>.*?)\s
                         (?P<protocol>\S+)
                       "\s
                       (?P<status>\S+)\s
                       (?P<bytes>\S+)
                       (?:
                         \s
                         "
                           (?P<referrer>[^"]+)
                         "\s
                         "
                           (?P<agent>[^"]+)
                         "
                       )?''')
for line in LOGFILE:
    f = split_re.match(line)
    if f:
        print "agent = %s" % f.groupdict()['agent']

Processing Server Logs

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Program: htmlsub

# @@INCOMPLETE@@
# @@INCOMPLETE@@

Program: hrefsub