//----------------------------------------------------------------------------------
// Many packages are available for simulating a browser. A good starting point:
// http://groovy.codehaus.org/Testing+Web+Applications
// (note: Codehaus shut down in 2015; that page is now only reachable through
// archives such as web.archive.org)
//----------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------
// Fetching URL content: java.net.URL plus Groovy's convenience accessors.

// for non-binary content, URL.getText() returns the whole body as a String
urlStr = 'http://groovy.codehaus.org'
content = new URL(urlStr).text
println content.size()
// => 34824

// for binary content, copy the raw stream into a byte buffer
urlStr = 'http://groovy.codehaus.org/download/attachments/1871/gina_3d.gif'
bytes = new ByteArrayOutputStream()
bytes << new URL(urlStr).openStream()
println bytes.size()
// => 6066

// various forms of potential error checking
try {
    new URL('x:y:z')
} catch (MalformedURLException ex) {
    println ex.message // => unknown protocol: x
}
try {
    new URL('cnn.com/not.there')
} catch (MalformedURLException ex) {
    println ex.message // => no protocol: cnn.com/not.there
}
try {
    content = new URL('http://cnn.com/not.there').text
} catch (FileNotFoundException ex) {
    println "Couldn't find: " + ex.message
    // => Couldn't find: http://www.cnn.com/not.there
}

// titleBytes example: report how many lines and bytes a URL serves
def titleBytes(address) {
    def lineTally = 0
    def byteTally = 0
    new URL(address).eachLine{ ln ->
        lineTally++
        byteTally += ln.size()
    }
    println "$address => ($lineTally lines, $byteTally bytes)"
}
titleBytes('http://www.tpj.com/')
// http://www.tpj.com/ => (677 lines, 25503 bytes)
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Automating form submission with a headless browser.
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient

def webClient = new WebClient()
def page = webClient.getPage('http://search.cpan.org/')
// check page title
assert page.titleText.startsWith('The CPAN Search Site')
// fill in form and submit it
def form = page.getFormByName('f')
def field = form.getInputByName('query')
field.setValueAttribute('DB_File')
def button = form.getInputByValue('CPAN Search')
def result = button.click()   // click() returns the resulting page
// check search result has at least one link ending in DB_File.pm
assert result.anchors.any{ a -> a.hrefAttribute.endsWith('DB_File.pm') }
// fields must be properly escaped when building query strings by hand
println URLEncoder.encode(/"this isn't <EASY>&<FUN>"/, 'utf-8')
// => %22this+isn%27t+%3CEASY%3E%26%3CFUN%3E%22
// proxies can be taken from environment, or specified
//System.properties.putAll( ["http.proxyHost":"proxy-host", "http.proxyPort":"proxy-port",
//                           "http.proxyUserName":"user-name", "http.proxyPassword":"proxy-passwd"] )
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// List all unique links on a page, sorted.
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient

client = new WebClient()
html = client.getPage('http://www.perl.com/CPAN/')
// BUG FIX: the original printed 'page.anchors...' but the fetched page was
// stored in 'html'; 'page' was never defined in this snippet and would raise
// a MissingPropertyException at runtime.
println html.anchors.collect{ it.hrefAttribute }.sort().unique().join('\n')
// =>
// disclaimer.html
// http://bookmarks.cpan.org/
// http://faq.perl.org/
// mailto:cpan@perl.org
// ...
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Convert plain text to HTML: paragraphs, quoted text, URLs, *bold*, _italic_.

// split paragraphs (two consecutive line separators delimit a paragraph)
LS = System.properties.'line.separator'
new File(args[0]).text.split("$LS$LS").each{ para ->
    if (para.startsWith(" ")) println "<pre>\n$para\n</pre>"  // indented => preformatted
    else {
        para = para.replaceAll(/(?m)^(>.*?)$/, /$1<br \/>/)            // quoted text
        para = para.replaceAll(/<URL:(.*)>/, /<a href="$1">$1<\/a>/)   // embedded URL
        para = para.replaceAll(/(http:\S+)/, /<a href="$1">$1<\/a>/)   // guessed URL
        para = para.replaceAll('\\*(\\S+)\\*', /<strong>$1<\/strong>/) // this is *bold* here
        para = para.replaceAll(/\b_(\S+)_\b/, /<em>$1<\/em>/)          // this is _italic_ here
        println "<p>\n$para\n</p>" // add paragraph tags
    }
}

// Render "Header: value" style lines of an email block as an HTML table.
def encodeEmail(email) {
    println "<table>"
    // BUG FIX: the original called the deprecated URLEncoder.encode(String)
    // (platform-default charset) and then operated on an undefined variable
    // 'text'; both replaceAll calls now correctly transform 'email'.
    email = URLEncoder.encode(email, 'utf-8')
    email = email.replaceAll(/(\n[ \t]+)/, / . /) // continuation lines
    email = email.replaceAll(/(?m)^(\S+?:)\s*(.*?)$/,
            /<tr><th align="left">$1<\/th><td>$2<\/td><\/tr>/);
    println email
    println "</table>"
}
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Dump all text content of an HTML page.
// using CyberNeko Parser (people.apache.org/~andyc/neko/doc)
parser = new org.cyberneko.html.parsers.SAXParser()
// turn namespaces off so elements keep their plain HTML names
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
// walk the node tree depth-first, printing each node's text
page.depthFirst().each{ println it.text() }
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Extract the title of an HTML page, two ways.
// removing tags, see 20.5
// extracting tags: htitle using cyberneko and XmlSlurper
// (NOTE(review): the comment above mentions XmlSlurper but the code actually
// uses XmlParser)
parser = new org.cyberneko.html.parsers.SAXParser()
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
println page.HEAD.TITLE[0].text()

// extracting tags: htitle using HtmlUnit
// (NOTE(review): relies on WebClient having been imported by an earlier snippet)
client = new WebClient()
html = client.getPage('http://www.perl.com/CPAN/')
println html.titleText
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Check every link on a page and report OK/BAD for each.
import com.gargoylesoftware.htmlunit.WebClient

client = new WebClient()
page = client.getPage('http://www.perl.com/CPAN/')
page.anchors.each{ checkUrl(page, it.hrefAttribute) }

// Try to fetch a (possibly relative) link from the given page.
// 'client' is an undeclared script (binding) variable, so it is visible here.
def checkUrl(page, url) {
    try {
        print "$url "
        qurl = page.getFullyQualifiedUrl(url)  // resolve relative to the page
        client.getPage(qurl)
        println 'OK'
    } catch (Exception ex) {
        println 'BAD'   // any failure (404, unsupported protocol, ...) counts as BAD
    }
}
// =>
// modules/index.html OK
// RECENT.html OK
// http://search.cpan.org/recent OK
// http://mirrors.cpan.org/ OK
// http://perldoc.perl.org/ OK
// mailto:cpan@perl.org BAD
// http://www.csc.fi/suomi/funet/verkko.html.en/ BAD
// ...
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Report when a set of pages was last modified, oldest first.
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.HeadMethod
import java.text.DateFormat

urls = [
    "http://www.apache.org/",
    "http://www.perl.org/",
    "http://www.python.org/",
    "http://www.ora.com/",
    "http://jakarta.apache.org/",
    "http://www.w3.org/"
]

df = DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.MEDIUM)
client = new HttpClient()
urlInfo = [:]   // last-modified Date -> url
urls.each{ url ->
    head = new HeadMethod(url)
    client.executeMethod(head)
    lastModified = head.getResponseHeader("last-modified")?.value
    // BUG FIX: servers are not required to send a Last-Modified header; the
    // original passed null straight into df.parse() and threw an NPE.
    if (lastModified) {
        urlInfo[df.parse(lastModified)] = url
    }
}

// print sorted by modification date, oldest first
urlInfo.keySet().sort().each{ key ->
    println "$key ${urlInfo[key]}"
}
// =>
// Sun Jan 07 21:48:15 EST 2007 http://www.apache.org/
// Sat Jan 13 12:44:32 EST 2007 http://jakarta.apache.org/
// Fri Jan 19 14:50:13 EST 2007 http://www.w3.org/
// Fri Jan 19 19:28:35 EST 2007 http://www.python.org/
// Sat Jan 20 09:36:08 EST 2007 http://www.ora.com/
// Sat Jan 20 13:25:53 EST 2007 http://www.perl.org/
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Three ways to produce templated output.

// GString version (variables must be predefined):
username = 'Tom'
count = 99
total = 999
htmlStr = """
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
"""
println htmlStr

// SimpleTemplateEngine version: placeholders stay literal (single quotes)
// until the template is bound with make()
def templateSource = '''
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
'''
def templateEngine = new groovy.text.SimpleTemplateEngine()
def template = templateEngine.createTemplate(new StringReader(templateSource))
println template.make(username:"Peter", count:"23", total: "1234")

// SQL version: feed the counts straight from the database into the template
import groovy.sql.Sql
user = 'Peter'
def db = Sql.newInstance('jdbc:mysql://localhost:3306/mydb', 'dbuser',
                         'dbpass', 'com.mysql.jdbc.Driver')
db.query("SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='$user'") { answer ->
    println(template.make(username:user, count:answer[0], total:answer[1]))
}
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Conditional (If-Modified-Since) requests, two ways.
// BUG FIX: the original used 'new Date(2007,1,18)'. That deprecated constructor
// takes (year-1900, zero-based month, day), so it actually meant 18 Feb 3907,
// not 18 Jan 2007. Build the intended date with GregorianCalendar instead.
sinceDate = new GregorianCalendar(2007, Calendar.JANUARY, 18).time

// using built-in connection features
urlStr = 'http://jakarta.apache.org/'
url = new URL(urlStr)
connection = url.openConnection()
connection.ifModifiedSince = sinceDate.time
connection.connect()
println connection.responseCode   // 304 if unchanged since the date, else 200

// manually setting the header field (must be RFC 1123 format, GMT zone)
connection = url.openConnection()
df = new java.text.SimpleDateFormat ("EEE, dd MMM yyyy HH:mm:ss 'GMT'")
df.setTimeZone(TimeZone.getTimeZone('GMT'))
connection.setRequestProperty("If-Modified-Since", df.format(sinceDate));
connection.connect()
println connection.responseCode
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// The website http://www.robotstxt.org/wc/active/html/ lists many available robots
// including Java ones which can be used from Groovy. In particular, j-spider
// allows you to:
// + Check your site for errors (internal server errors, ...)
// + Outgoing and/or internal link checking
// + Analyze your site structure (creating a sitemap, ...)
// + Download complete web sites
// Most of its functionality is available by tweaking appropriate configuration
// files and then running it as a standalone application, but you can also write
// your own Java classes.
//----------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------
// Parse web server access-log lines into named fields.
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''

// similar to perl version: one capture group per field, in order
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/

LOGFILE.trim().split('\n').each{ line ->
    def matcher = line =~ regex
    if (matcher.matches()) {
        // label each capture group with its field name; group 0 is the whole match
        fields.eachWithIndex{ name, i ->
            println "$name=${matcher[0][i+1]}"
        }
        println()
    }
}
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Summarise an access log: per-day hosts / accesses / unique docs / POSTs /
// home-page hits / bytes, plus a grand total.
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
204.31.113.138 - - [03/Jul/1996:06:56:12 -0800] "POST /forms/login.jsp HTTP/1.0" 200 5593
fcrawler.looksmart.com - - [26/Apr/2000:00:00:12 -0400] "GET /contacts.html HTTP/1.0" 200 4595 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
fcrawler.looksmart.com - - [26/Apr/2000:00:17:19 -0400] "GET /news/news.html HTTP/1.0" 200 16716 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
ppp931.on.bellglobal.com - - [26/Apr/2000:00:16:12 -0400] "GET /download/windows/asctab31.zip HTTP/1.0" 200 1540096 "http://www.htmlgoodies.com/downloads/freeware/webdevelopment/15.html" "Mozilla/4.7 [en]C-SYMPA (Win95; U)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/wpaper.gif HTTP/1.0" 200 6248 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:47 -0400] "GET /asctortf/ HTTP/1.0" 200 8130 "http://search.netscape.com/Computers/Data_Formats/Document/Text/RTF" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/5star2000.gif HTTP/1.0" 200 4005 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:50 -0400] "GET /pics/5star.gif HTTP/1.0" 200 1031 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /pics/a2hlogo.jpg HTTP/1.0" 200 4282 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /cgi-bin/newcount?jafsof3&width=4&font=digital&noshow HTTP/1.0" 200 36 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET / HTTP/1.1" 200 1927
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''

// one capture group per field, same layout as the previous snippet
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/

// per-date accumulator
class Summary {
    def hosts = [:]      // client host -> hit count
    def what = [:]       // url -> hit count ('Unidocs' = number of distinct urls)
    def accessCount = 0
    def postCount = 0
    def homeCount = 0    // requests for '/'
    def totalBytes = 0
}

totals = [:]   // date string -> Summary
LOGFILE.trim().split('\n').each{ line ->
    m = line =~ regex
    if (m.matches()) {
        date = m[0][fields.indexOf('date')+1]
        // Groovy Map.get(key, default) stores the default for a missing key,
        // so this is a get-or-create for the date's Summary
        s = totals.get(date, new Summary())
        s.accessCount++
        if (m[0][fields.indexOf('method')+1] == 'POST') s.postCount++
        s.totalBytes += (m[0][fields.indexOf('bytes')+1]).toInteger()
        def url = m[0][fields.indexOf('url')+1]
        if (url == '/') s.homeCount++
        s.what[url] = s.what.get(url, 0) + 1
        def host = m[0][fields.indexOf('client')+1]
        s.hosts[host] = s.hosts.get(host, 0) + 1
    }
}

// header row, one row per date, then the grand total
report('Date','Hosts','Accesses','Unidocs','POST','Home','Bytes')
totals.each{ key, s ->
    report(key, s.hosts.size(), s.accessCount, s.what.size(), s.postCount, s.homeCount, s.totalBytes)
}
v = totals.values()
report('Grand Total', v.sum{it.hosts.size()}, v.sum{it.accessCount}, v.sum{it.what.size()},
       v.sum{it.postCount}, v.sum{it.homeCount}, v.sum{it.totalBytes} )

// print one fixed-width report line (script methods may be called before
// their textual definition)
def report(a, b, c, d, e, f, g) {
    printf ("%12s %6s %8s %8s %8s %8s %10s\n", [a,b,c,d,e,f,g])
}
// =>
//         Date  Hosts Accesses  Unidocs     POST     Home      Bytes
//  03/Jul/1996      1        1        1        1        0       5593
//  10/Oct/2000      1        1        1        0        0       2326
//  04/Sep/2005      1        2        2        0        1       2230
//  05/Sep/2005      1        2        1        0        0      12456
//  26/Apr/2000      3        6        6        0        0    1579790
//  27/Apr/2000      1        3        3        0        0       5349
//  Grand Total      8       15       14        1        1    1607744

// Some open source log processing packages in Java:
//   http://www.generationjava.com/projects/logview/index.shtml
//   http://ostermiller.org/webalizer/
//   http://jxla.nvdcms.org/en/index.xml
//   http://polliwog.sourceforge.net/index.html
// as well as textual reports, most of these can produce graphical reports
// Most have their own configuration information and Java extension points.
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Rewrite words in an HTML document's text nodes while echoing it, using a
// NekoHTML streaming filter chain.
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource

input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog himself. Here's one!<P>
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send cards</A>.
</BODY></HTML>
'''

// XNI filter that replaces a regex in every run of character data as it
// streams past; element names and attributes pass through untouched
class WordReplaceFilter extends DefaultFilter {
    private before, after
    WordReplaceFilter(b, a) { before = b; after = a }
    void characters(XMLString text, Augmentations augs) {
        // Groovy coerces the replaced String to char[] for the XMLString ctor
        char[] c = text.toString().replaceAll(before, after)
        super.characters(new XMLString(c, 0, c.size()), augs)
    }
    void setProperty(String s, Object o){}   // no configurable properties
}
XMLDocumentFilter[] filters = [
    new WordReplaceFilter(/(?sm)picture/, /photo/), // change 'picture' -> 'photo'
    new Writer()                                    // serialise the result to stdout
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", filters)
parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Rewrite link targets (HREF attributes) while echoing an HTML document,
// using a NekoHTML streaming filter chain.
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource

input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog himself. Here's one!<P>
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send cards</A>.
</BODY></HTML>
'''

// XNI filter that rewrites the href attribute of each start tag
// (NOTE(review): looks like NekoHTML reports attribute names lower-cased so
// getIndex('href') matches the HREF in the input -- confirm against the
// parser's 'names/attrs' property defaults)
class HrefReplaceFilter extends DefaultFilter {
    private before, after
    HrefReplaceFilter(b, a) { before = b; after = a }
    void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
        def idx = attributes.getIndex('href')
        if (idx != -1) {   // -1 means this element has no href attribute
            def newtext = attributes.getValue(idx).replaceAll(before, after)
            // NOTE(review): single-arg URLEncoder.encode is deprecated (uses the
            // platform charset) and encodes a whole URL, not just its path part
            attributes.setValue(idx, URLEncoder.encode(newtext))
        }
        super.startElement(element, attributes, augs)
    }
    void setProperty(String s, Object o){}   // no configurable properties
}
XMLDocumentFilter[] myfilters = [
    new HrefReplaceFilter(/shergold.html/, /cards.html/), // retarget the link
    new Writer()                                          // serialise to stdout
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", myfilters)
parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------