//----------------------------------------------------------------------------------
// Many packages are available for simulating a browser. A good starting point:
// http://groovy.codehaus.org/Testing+Web+Applications
// (note: Codehaus shut down in 2015; that page is now only reachable through
// archives such as web.archive.org)
//----------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------
// Fetching URL content: java.net.URL plus Groovy's convenience accessors.

// for non-binary content, URL.getText() returns the whole body as a String
urlStr = 'http://groovy.codehaus.org'
content = new URL(urlStr).text
println content.size()
// => 34824

// for binary content, copy the raw stream into a byte buffer
urlStr = 'http://groovy.codehaus.org/download/attachments/1871/gina_3d.gif'
bytes = new ByteArrayOutputStream()
bytes << new URL(urlStr).openStream()
println bytes.size()
// => 6066

// various forms of potential error checking
try {
    new URL('x:y:z')
} catch (MalformedURLException ex) {
    println ex.message // => unknown protocol: x
}
try {
    new URL('cnn.com/not.there')
} catch (MalformedURLException ex) {
    println ex.message // => no protocol: cnn.com/not.there
}
try {
    content = new URL('http://cnn.com/not.there').text
} catch (FileNotFoundException ex) {
    println "Couldn't find: " + ex.message
    // => Couldn't find: http://www.cnn.com/not.there
}

// titleBytes example: report how many lines and bytes a URL serves
def titleBytes(address) {
    def lineTally = 0
    def byteTally = 0
    new URL(address).eachLine{ ln ->
        lineTally++
        byteTally += ln.size()
    }
    println "$address => ($lineTally lines, $byteTally bytes)"
}
titleBytes('http://www.tpj.com/')
// http://www.tpj.com/ => (677 lines, 25503 bytes)
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Automating form submission with a headless browser.
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient

def webClient = new WebClient()
def page = webClient.getPage('http://search.cpan.org/')
// check page title
assert page.titleText.startsWith('The CPAN Search Site')
// fill in form and submit it
def form = page.getFormByName('f')
def field = form.getInputByName('query')
field.setValueAttribute('DB_File')
def button = form.getInputByValue('CPAN Search')
def result = button.click()   // click() returns the resulting page
// check search result has at least one link ending in DB_File.pm
assert result.anchors.any{ a -> a.hrefAttribute.endsWith('DB_File.pm') }
// fields must be properly escaped when building query strings by hand
println URLEncoder.encode(/"this isn't <EASY>&<FUN>"/, 'utf-8')
// => %22this+isn%27t+%3CEASY%3E%26%3CFUN%3E%22
// proxies can be taken from environment, or specified
//System.properties.putAll( ["http.proxyHost":"proxy-host", "http.proxyPort":"proxy-port",
//                           "http.proxyUserName":"user-name", "http.proxyPassword":"proxy-passwd"] )
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// List all unique links on a page, sorted.
// using HtmlUnit (htmlunit.sf.net)
import com.gargoylesoftware.htmlunit.WebClient

client = new WebClient()
html = client.getPage('http://www.perl.com/CPAN/')
// BUG FIX: the original printed 'page.anchors...' but the fetched page was
// stored in 'html'; 'page' was never defined in this snippet and would raise
// a MissingPropertyException at runtime.
println html.anchors.collect{ it.hrefAttribute }.sort().unique().join('\n')
// =>
// disclaimer.html
// http://bookmarks.cpan.org/
// http://faq.perl.org/
// mailto:cpan@perl.org
// ...
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Convert plain text to HTML: paragraphs, quoted text, URLs, *bold*, _italic_.

// split paragraphs (two consecutive line separators delimit a paragraph)
LS = System.properties.'line.separator'
new File(args[0]).text.split("$LS$LS").each{ para ->
    if (para.startsWith(" ")) println "<pre>\n$para\n</pre>"  // indented => preformatted
    else {
        para = para.replaceAll(/(?m)^(>.*?)$/, /$1<br \/>/)            // quoted text
        para = para.replaceAll(/<URL:(.*)>/, /<a href="$1">$1<\/a>/)   // embedded URL
        para = para.replaceAll(/(http:\S+)/, /<a href="$1">$1<\/a>/)   // guessed URL
        para = para.replaceAll('\\*(\\S+)\\*', /<strong>$1<\/strong>/) // this is *bold* here
        para = para.replaceAll(/\b_(\S+)_\b/, /<em>$1<\/em>/)          // this is _italic_ here
        println "<p>\n$para\n</p>" // add paragraph tags
    }
}

// Render "Header: value" style lines of an email block as an HTML table.
def encodeEmail(email) {
    println "<table>"
    // BUG FIX: the original called the deprecated URLEncoder.encode(String)
    // (platform-default charset) and then operated on an undefined variable
    // 'text'; both replaceAll calls now correctly transform 'email'.
    email = URLEncoder.encode(email, 'utf-8')
    email = email.replaceAll(/(\n[ \t]+)/, / . /) // continuation lines
    email = email.replaceAll(/(?m)^(\S+?:)\s*(.*?)$/,
            /<tr><th align="left">$1<\/th><td>$2<\/td><\/tr>/);
    println email
    println "</table>"
}
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Dump all text content of an HTML page.
// using CyberNeko Parser (people.apache.org/~andyc/neko/doc)
parser = new org.cyberneko.html.parsers.SAXParser()
// turn namespaces off so elements keep their plain HTML names
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
// walk the node tree depth-first, printing each node's text
page.depthFirst().each{ println it.text() }
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Extract the title of an HTML page, two ways.
// removing tags, see 20.5
// extracting tags: htitle using cyberneko and XmlSlurper
// (NOTE(review): the comment above mentions XmlSlurper but the code actually
// uses XmlParser)
parser = new org.cyberneko.html.parsers.SAXParser()
parser.setFeature('http://xml.org/sax/features/namespaces', false)
page = new XmlParser(parser).parse('http://www.perl.com/CPAN/')
println page.HEAD.TITLE[0].text()

// extracting tags: htitle using HtmlUnit
// (NOTE(review): relies on WebClient having been imported by an earlier snippet)
client = new WebClient()
html = client.getPage('http://www.perl.com/CPAN/')
println html.titleText
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Check every link on a page and report OK/BAD for each.
import com.gargoylesoftware.htmlunit.WebClient

client = new WebClient()
page = client.getPage('http://www.perl.com/CPAN/')
page.anchors.each{ checkUrl(page, it.hrefAttribute) }

// Try to fetch a (possibly relative) link from the given page.
// 'client' is an undeclared script (binding) variable, so it is visible here.
def checkUrl(page, url) {
    try {
        print "$url "
        qurl = page.getFullyQualifiedUrl(url)  // resolve relative to the page
        client.getPage(qurl)
        println 'OK'
    } catch (Exception ex) {
        println 'BAD'   // any failure (404, unsupported protocol, ...) counts as BAD
    }
}
// =>
// modules/index.html OK
// RECENT.html OK
// http://search.cpan.org/recent OK
// http://mirrors.cpan.org/ OK
// http://perldoc.perl.org/ OK
// mailto:cpan@perl.org BAD
// http://www.csc.fi/suomi/funet/verkko.html.en/ BAD
// ...
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Report when a set of pages was last modified, oldest first.
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.HeadMethod
import java.text.DateFormat

urls = [
    "http://www.apache.org/",
    "http://www.perl.org/",
    "http://www.python.org/",
    "http://www.ora.com/",
    "http://jakarta.apache.org/",
    "http://www.w3.org/"
]

df = DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.MEDIUM)
client = new HttpClient()
urlInfo = [:]   // last-modified Date -> url
urls.each{ url ->
    head = new HeadMethod(url)
    client.executeMethod(head)
    lastModified = head.getResponseHeader("last-modified")?.value
    // BUG FIX: servers are not required to send a Last-Modified header; the
    // original passed null straight into df.parse() and threw an NPE.
    if (lastModified) {
        urlInfo[df.parse(lastModified)] = url
    }
}

// print sorted by modification date, oldest first
urlInfo.keySet().sort().each{ key ->
    println "$key ${urlInfo[key]}"
}
// =>
// Sun Jan 07 21:48:15 EST 2007 http://www.apache.org/
// Sat Jan 13 12:44:32 EST 2007 http://jakarta.apache.org/
// Fri Jan 19 14:50:13 EST 2007 http://www.w3.org/
// Fri Jan 19 19:28:35 EST 2007 http://www.python.org/
// Sat Jan 20 09:36:08 EST 2007 http://www.ora.com/
// Sat Jan 20 13:25:53 EST 2007 http://www.perl.org/
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Three ways to produce templated output.

// GString version (variables must be predefined):
username = 'Tom'
count = 99
total = 999
htmlStr = """
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
"""
println htmlStr

// SimpleTemplateEngine version: placeholders stay literal (single quotes)
// until the template is bound with make()
def templateSource = '''
<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for $username</TITLE></HEAD>
<BODY><H1>Report for $username</H1>
$username logged in $count times, for a total of $total minutes.
'''
def templateEngine = new groovy.text.SimpleTemplateEngine()
def template = templateEngine.createTemplate(new StringReader(templateSource))
println template.make(username:"Peter", count:"23", total: "1234")

// SQL version: feed the counts straight from the database into the template
import groovy.sql.Sql
user = 'Peter'
def db = Sql.newInstance('jdbc:mysql://localhost:3306/mydb', 'dbuser',
                         'dbpass', 'com.mysql.jdbc.Driver')
db.query("SELECT COUNT(duration),SUM(duration) FROM logins WHERE username='$user'") { answer ->
    println(template.make(username:user, count:answer[0], total:answer[1]))
}
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Conditional (If-Modified-Since) requests, two ways.
// BUG FIX: the original used 'new Date(2007,1,18)'. That deprecated constructor
// takes (year-1900, zero-based month, day), so it actually meant 18 Feb 3907,
// not 18 Jan 2007. Build the intended date with GregorianCalendar instead.
sinceDate = new GregorianCalendar(2007, Calendar.JANUARY, 18).time

// using built-in connection features
urlStr = 'http://jakarta.apache.org/'
url = new URL(urlStr)
connection = url.openConnection()
connection.ifModifiedSince = sinceDate.time
connection.connect()
println connection.responseCode   // 304 if unchanged since the date, else 200

// manually setting the header field (must be RFC 1123 format, GMT zone)
connection = url.openConnection()
df = new java.text.SimpleDateFormat ("EEE, dd MMM yyyy HH:mm:ss 'GMT'")
df.setTimeZone(TimeZone.getTimeZone('GMT'))
connection.setRequestProperty("If-Modified-Since", df.format(sinceDate));
connection.connect()
println connection.responseCode
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// The website http://www.robotstxt.org/wc/active/html/ lists many available robots
// including Java ones which can be used from Groovy. In particular, j-spider
// allows you to:
// + Check your site for errors (internal server errors, ...)
// + Outgoing and/or internal link checking
// + Analyze your site structure (creating a sitemap, ...)
// + Download complete web sites
// Most of its functionality is available by tweaking appropriate configuration
// files and then running it as a standalone application, but you can also write
// your own Java classes.
//----------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------
// Parse web server access-log lines into named fields.
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [04/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''

// similar to perl version: one capture group per field, in order
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/

LOGFILE.trim().split('\n').each{ line ->
    def matcher = line =~ regex
    if (matcher.matches()) {
        // label each capture group with its field name; group 0 is the whole match
        fields.eachWithIndex{ name, i ->
            println "$name=${matcher[0][i+1]}"
        }
        println()
    }
}
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Summarise an access log: per-day hosts / accesses / unique docs / POSTs /
// home-page hits / bytes, plus a grand total.
// sample data, use 'LOGFILE = new File(args[0]).text' or similar
LOGFILE = '''
204.31.113.138 - - [03/Jul/1996:06:56:12 -0800] "POST /forms/login.jsp HTTP/1.0" 200 5593
fcrawler.looksmart.com - - [26/Apr/2000:00:00:12 -0400] "GET /contacts.html HTTP/1.0" 200 4595 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
fcrawler.looksmart.com - - [26/Apr/2000:00:17:19 -0400] "GET /news/news.html HTTP/1.0" 200 16716 "-" "FAST-WebCrawler/2.1-pre2 (ashen@looksmart.net)"
ppp931.on.bellglobal.com - - [26/Apr/2000:00:16:12 -0400] "GET /download/windows/asctab31.zip HTTP/1.0" 200 1540096 "http://www.htmlgoodies.com/downloads/freeware/webdevelopment/15.html" "Mozilla/4.7 [en]C-SYMPA (Win95; U)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/wpaper.gif HTTP/1.0" 200 6248 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:47 -0400] "GET /asctortf/ HTTP/1.0" 200 8130 "http://search.netscape.com/Computers/Data_Formats/Document/Text/RTF" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [26/Apr/2000:00:23:48 -0400] "GET /pics/5star2000.gif HTTP/1.0" 200 4005 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:50 -0400] "GET /pics/5star.gif HTTP/1.0" 200 1031 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /pics/a2hlogo.jpg HTTP/1.0" 200 4282 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
123.123.123.123 - - [27/Apr/2000:00:23:51 -0400] "GET /cgi-bin/newcount?jafsof3&width=4&font=digital&noshow HTTP/1.0" 200 36 "http://www.jafsoft.com/asctortf/" "Mozilla/4.05 (Macintosh; I; PPC)"
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET / HTTP/1.1" 200 1927
127.0.0.1 - - [04/Sep/2005:20:50:31 +0200] "GET /bus HTTP/1.1" 301 303 "-" "Opera/8.02 (X11; Linux i686; U; en)"
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228
192.168.0.1 - - [05/Sep/2005:20:50:36 +0200] "GET /bus/libjs/layersmenu-library.js HTTP/1.1" 200 6228 "http://localhost/bus/" "Opera/8.02 (X11; Linux i686; U; en)"
'''

// one capture group per field, same layout as the previous snippet
fields = ['client','identuser','authuser','date','time','tz','method','url','protocol','status','bytes']
regex = /^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+).*$/

// per-date accumulator
class Summary {
    def hosts = [:]      // client host -> hit count
    def what = [:]       // url -> hit count ('Unidocs' = number of distinct urls)
    def accessCount = 0
    def postCount = 0
    def homeCount = 0    // requests for '/'
    def totalBytes = 0
}

totals = [:]   // date string -> Summary
LOGFILE.trim().split('\n').each{ line ->
    m = line =~ regex
    if (m.matches()) {
        date = m[0][fields.indexOf('date')+1]
        // Groovy Map.get(key, default) stores the default for a missing key,
        // so this is a get-or-create for the date's Summary
        s = totals.get(date, new Summary())
        s.accessCount++
        if (m[0][fields.indexOf('method')+1] == 'POST') s.postCount++
        s.totalBytes += (m[0][fields.indexOf('bytes')+1]).toInteger()
        def url = m[0][fields.indexOf('url')+1]
        if (url == '/') s.homeCount++
        s.what[url] = s.what.get(url, 0) + 1
        def host = m[0][fields.indexOf('client')+1]
        s.hosts[host] = s.hosts.get(host, 0) + 1
    }
}

// header row, one row per date, then the grand total
report('Date','Hosts','Accesses','Unidocs','POST','Home','Bytes')
totals.each{ key, s ->
    report(key, s.hosts.size(), s.accessCount, s.what.size(), s.postCount, s.homeCount, s.totalBytes)
}
v = totals.values()
report('Grand Total', v.sum{it.hosts.size()}, v.sum{it.accessCount}, v.sum{it.what.size()},
       v.sum{it.postCount}, v.sum{it.homeCount}, v.sum{it.totalBytes} )

// print one fixed-width report line (script methods may be called before
// their textual definition)
def report(a, b, c, d, e, f, g) {
    printf ("%12s %6s %8s %8s %8s %8s %10s\n", [a,b,c,d,e,f,g])
}
// =>
//         Date  Hosts Accesses  Unidocs     POST     Home      Bytes
//  03/Jul/1996      1        1        1        1        0       5593
//  10/Oct/2000      1        1        1        0        0       2326
//  04/Sep/2005      1        2        2        0        1       2230
//  05/Sep/2005      1        2        1        0        0      12456
//  26/Apr/2000      3        6        6        0        0    1579790
//  27/Apr/2000      1        3        3        0        0       5349
//  Grand Total      8       15       14        1        1    1607744

// Some open source log processing packages in Java:
//   http://www.generationjava.com/projects/logview/index.shtml
//   http://ostermiller.org/webalizer/
//   http://jxla.nvdcms.org/en/index.xml
//   http://polliwog.sourceforge.net/index.html
// as well as textual reports, most of these can produce graphical reports
// Most have their own configuration information and Java extension points.
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Rewrite words in an HTML document's text nodes while echoing it, using a
// NekoHTML streaming filter chain.
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource

input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog himself. Here's one!<P>
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send cards</A>.
</BODY></HTML>
'''

// XNI filter that replaces a regex in every run of character data as it
// streams past; element names and attributes pass through untouched
class WordReplaceFilter extends DefaultFilter {
    private before, after
    WordReplaceFilter(b, a) { before = b; after = a }
    void characters(XMLString text, Augmentations augs) {
        // Groovy coerces the replaced String to char[] for the XMLString ctor
        char[] c = text.toString().replaceAll(before, after)
        super.characters(new XMLString(c, 0, c.size()), augs)
    }
    void setProperty(String s, Object o){}   // no configurable properties
}
XMLDocumentFilter[] filters = [
    new WordReplaceFilter(/(?sm)picture/, /photo/), // change 'picture' -> 'photo'
    new Writer()                                    // serialise the result to stdout
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", filters)
parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// Rewrite link targets (HREF attributes) while echoing an HTML document,
// using a NekoHTML streaming filter chain.
import org.cyberneko.html.filters.Writer
import org.cyberneko.html.filters.DefaultFilter
import org.apache.xerces.xni.parser.XMLDocumentFilter
import org.apache.xerces.xni.*
import org.cyberneko.html.parsers.DOMParser
import org.xml.sax.InputSource

input = '''
<HTML><HEAD><TITLE>Hi!</TITLE></HEAD><BODY>
<H1>Welcome to Scooby World!</H1>
I have <A HREF="pictures.html">pictures</A> of the crazy dog himself. Here's one!<P>
<IMG SRC="scooby.jpg" ALT="Good doggy!"><P>
<BLINK>He's my hero!</BLINK> I would like to meet him some day,
and get my picture taken with him.<P>
P.S. I am deathly ill. <A HREF="shergold.html">Please send cards</A>.
</BODY></HTML>
'''

// XNI filter that rewrites the href attribute of each start tag
// (NOTE(review): looks like NekoHTML reports attribute names lower-cased so
// getIndex('href') matches the HREF in the input -- confirm against the
// parser's 'names/attrs' property defaults)
class HrefReplaceFilter extends DefaultFilter {
    private before, after
    HrefReplaceFilter(b, a) { before = b; after = a }
    void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
        def idx = attributes.getIndex('href')
        if (idx != -1) {   // -1 means this element has no href attribute
            def newtext = attributes.getValue(idx).replaceAll(before, after)
            // NOTE(review): single-arg URLEncoder.encode is deprecated (uses the
            // platform charset) and encodes a whole URL, not just its path part
            attributes.setValue(idx, URLEncoder.encode(newtext))
        }
        super.startElement(element, attributes, augs)
    }
    void setProperty(String s, Object o){}   // no configurable properties
}
XMLDocumentFilter[] myfilters = [
    new HrefReplaceFilter(/shergold.html/, /cards.html/), // retarget the link
    new Writer()                                          // serialise to stdout
]
parser = new DOMParser()
parser.setProperty("http://cyberneko.org/html/properties/filters", myfilters)
parser.parse(new InputSource(new StringReader(input)))
//----------------------------------------------------------------------------------