20. Web Automation

Introduction

(* Libraries for HTTP clients and servers are listed at The Caml Hump: *)
http://caml.inria.fr/cgi-bin/hump.en.cgi?browse=40

Fetching a URL from a Perl Script

(* If you just want to read a URL as a string, Ocamlnet's "Convenience"
   interface to Http_client is as easy as it gets. For the more powerful
   general interface to Http_client, see the next example. *)
#use "topfind";;
#require "netclient";;
open Http_client.Convenience
let content = http_get url

(*-----------------------------*)

#!/usr/bin/ocaml
(* titlebytes - find the title and size of documents  *)

#use "topfind";;
#require "str";;
#require "netclient";;

let raw_url = Sys.argv.(1)
let url = Neturl.parse_url raw_url
let () =
  Printf.printf "%s =>\n\t%!" (Neturl.string_of_url url);
  let call = new Http_client.get (Neturl.string_of_url url) in
  call#set_req_header "User-Agent" "Schmozilla/v9.14 Platinum";
  call#set_req_header "Referer" "http://wizard.yellowbrick.oz";
  let pipeline = new Http_client.pipeline in
  pipeline#add call;
  pipeline#run ();
  match call#status with
    | `Successful ->
        let content = call#get_resp_body () in
        let bytes = String.length content in
        let count = ref 0 in
        String.iter (function '\n' -> incr count | _ -> ()) content;
        let regexp =
          Str.regexp_case_fold ".*<title>\\([^<]*\\)</title>.*" in
        let title =
          try (ignore (Str.search_forward regexp content 0);
               Str.matched_group 1 content)
          with Not_found -> "(untitled)" in
        let title =
          Str.global_replace
            (Str.regexp "\\(^[\n\r\t ]+\\)\\|\\([\n\r\t ]+$\\)") ""
            title in
        Printf.printf "%s (%d lines, %d bytes)\n" title !count bytes
    | `Client_error ->
        Printf.eprintf "Client error: %d %s\n"
          call#response_status_code
          call#response_status_text
    | `Http_protocol_error e ->
        Printf.eprintf "HTTP protocol error: %s\n"
          (Printexc.to_string e)
    | `Redirection ->
        Printf.eprintf "Redirection\n"
    | `Server_error ->
        Printf.eprintf "Server error\n"
    | `Unserved ->
        assert false

Automating Form Submission

#use "topfind";;
#require "netclient";;
open Http_client.Convenience

(* Submit a form using GET. *)
let url = "http://www.perl.com/cgi-bin/cpan_mod?module=DB_File&readme=1"
let content = http_get url

(* Submit a form using POST. Since we need to follow a redirect here,
   we can't use the "Convenience" methods. *)
let url = "http://www.perl.com/cgi-bin/cpan_mod"
let params = ["module", "DB_File"; "readme", "1"]
let () =
  let call = new Http_client.post url params in
  call#set_redirect_mode Http_client.Redirect;
  let pipeline = new Http_client.pipeline in
  pipeline#add call;
  pipeline#run ()
let content = call#response_body#value

(* GET parameters can be URL encoded with Netencoding.Url.encode. *)
let arg = "\"this isn't <EASY> & <FUN>\""
Netencoding.Url.encode arg
(* - : string = "%22this+isn%27t+%3CEASY%3E+%26+%3CFUN%3E%22" *)
Netencoding.Url.encode ~plus:false arg
(* - : string = "%22this%20isn%27t%20%3CEASY%3E%20%26%20%3CFUN%3E%22" *)

(* To use a proxy, either set the "http_proxy" environment variable and
   call "set_proxy_from_environment" on the pipeline (done automatically
   for the "Convenience" methods) or set the proxy host and port using
   the "set_proxy" method: *)
let () = pipeline#set_proxy "localhost" 3128

Extracting URLs

(* The Nethtml library, part of Ocamlnet, can parse arbitrary HTML from
   files and web pages. *)

#use "topfind";;
#require "netstring";;

open Nethtml

(* Define a function to walk through all the elements in a document and
   accumulate the results of a user-supplied function for each element.
   This is known as a "fold" in functional programming. *)
let rec fold_elements f accu = function
  | Element (_, _, children) as element ->
      let accu =
        List.fold_right
          (fun child accu ->
             fold_elements f accu child)
          children
          accu in
      f accu element
  | other -> accu

(* Define a type for links so we can tell anchors and images apart. *)
type link = A of string | IMG of string

(* Using fold_elements, define a function that collects the URLs from
   all the "a" and "img" tags. *)
let find_links elements =
  List.flatten
    (List.map
       (fold_elements
          (fun accu element ->
             match element with
               | Element ("a", attribs, _) ->
                   (try A (List.assoc "href" attribs) :: accu
                    with Not_found -> accu)
               | Element ("img", attribs, _) ->
                   (try IMG (List.assoc "src" attribs) :: accu
                    with Not_found -> accu)
               | _ -> accu)
          [])
       elements)

(* Parse an HTML file. *)
let elements = parse (new Netchannels.input_channel (open_in filename))

(* Print the links we found. *)
let () =
  List.iter
    (function
       | A href -> Printf.printf "ANCHOR: %s\n" href
       | IMG src -> Printf.printf "IMAGE: %s\n" src)
    (find_links elements)

(*-----------------------------*)

#!/usr/bin/ocaml
(* xurl - extract unique, sorted list of links from URL *)

#use "topfind";;
#require "netclient";;

open Http_client.Convenience
open Nethtml

let rec fold_elements f accu = function
  | Element (_, _, children) as element ->
      let accu =
        List.fold_right
          (fun child accu ->
             fold_elements f accu child)
          children
          accu in
      f accu element
  | other -> accu

type link = A of string | IMG of string

let find_links elements =
  List.flatten
    (List.map
       (fold_elements
          (fun accu element ->
             match element with
               | Element ("a", attribs, _) ->
                   (try A (List.assoc "href" attribs) :: accu
                    with Not_found -> accu)
               | Element ("img", attribs, _) ->
                   (try IMG (List.assoc "src" attribs) :: accu
                    with Not_found -> accu)
               | _ -> accu)
          [])
       elements)

let base_url = Sys.argv.(1)
let elements = parse (new Netchannels.input_string (http_get base_url))
let url_syntax = Hashtbl.find Neturl.common_url_syntax "http"
let url_syntax =
  {url_syntax with
     Neturl.url_enable_fragment = Neturl.Url_part_allowed}
let url_syntax = Neturl.partial_url_syntax url_syntax

module StringSet = Set.Make(String)

let () =
  StringSet.iter print_endline
    (List.fold_left
       (fun accu s ->
          try
            StringSet.add
              (Neturl.string_of_url
                 (Neturl.apply_relative_url
                    (Neturl.url_of_string url_syntax base_url)
                    (Neturl.url_of_string url_syntax s)))
              accu
          with Neturl.Malformed_URL ->
            Printf.eprintf "Malformed URL: %s\n%!" s;
            accu)
       StringSet.empty
       (List.map
          (function
             | A href -> href
             | IMG src -> src)
          (find_links elements)))

Converting ASCII to HTML

#!/usr/bin/ocaml
(* text2html - trivial html encoding of normal text *)

#use "topfind";;
#require "str";;
#require "netstring";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let chop s =
  if s = "" then s else String.sub s 0 (String.length s - 1);;

let substitutions =
  [
    (* embedded URL (good) or guessed URL (bad) *)
    Str.regexp "\\(<URL:[^>]+>\\)\\|\\(http:[^ \n\r\t]+\\)",
    (fun s ->
       let s =
         if s.[0] = '<'
         then String.sub s 5 (String.length s - 6)
         else s in
       [Nethtml.Element ("a", ["href", s], [Nethtml.Data s])]);

    (* this is *bold* here *)
    Str.regexp "\\*[^*]+\\*",
    (fun s -> [Nethtml.Element ("strong", [], [Nethtml.Data s])]);

    (* this is _italics_ here *)
    Str.regexp "_[^ _]+_",
    (fun s -> [Nethtml.Element ("em", [], [Nethtml.Data s])]);
  ]

let substitute regexp func data =
  List.flatten
    (List.map
       (function
          | Str.Text s -> [Nethtml.Data s]
          | Str.Delim s -> func s)
       (Str.full_split regexp data))

let rec map_data f list =
  List.flatten
    (List.map
       (function
          | Nethtml.Data data -> f data
          | Nethtml.Element (name, attrs, children) ->
              [Nethtml.Element (name, attrs, map_data f children)])
       list)

let text2html text =
  (* Create the initial HTML tree. *)
  let html = [Nethtml.Data text] in

  (* Split text into lines. *)
  let html =
    List.flatten
      (List.map
         (function
            | Nethtml.Data data ->
                List.map
                  (fun line -> Nethtml.Data (line ^ "\n"))
                  ("" :: Str.split (Str.regexp "\n") data)
            | Nethtml.Element _ as e -> [e])
         html) in

  (* Perform inline substitutions. *)
  let html =
    List.fold_right
      (fun (regexp, func) ->
         map_data (substitute regexp func))
      substitutions
      html in

  (* Add line breaks to quoted text. *)
  let html =
    List.flatten
      (List.map
         (function
            | Nethtml.Data line when line.[0] = '>' ->
                [Nethtml.Data (chop line);
                 Nethtml.Element ("br", [], []);
                 Nethtml.Data "\n"]
            | Nethtml.Data line -> [Nethtml.Data line]
            | Nethtml.Element _ as e -> [e])
         html) in

  (* Return the finished document. *)
  html

let buffer = Buffer.create 0
let channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write channel (Nethtml.encode html)
let paragraphs = paragraph_stream_of_channel stdin

(* Main loop *)
let () =
  let first = ref true in
  Stream.iter
    (fun para ->
       if !first then first := false
       else write [Nethtml.Data "\n\n"];
       (* Paragraphs beginning with whitespace are wrapped in <pre> *)
       let tag, body =
         if String.length para > 0 && String.contains " \t" para.[0]
         then "pre", [Nethtml.Data "\n";
                      Nethtml.Data para;  (* indented verbatim *)
                      Nethtml.Data "\n"]
         else "p", text2html para in      (* add paragraph tag *)
       write [Nethtml.Element (tag, [], body)])
    paragraphs;
  print_endline (Buffer.contents buffer)

(*-----------------------------*)

(* To format mail headers as a table, add the following just before
   the main loop. *)
let () =
  let colon_delim = Str.regexp "[ \t]*:[ \t]*" in
  let continuation = Str.regexp "\n[ \t]+" in
  try
    let headers = Stream.next paragraphs in
    let headers = Str.global_replace continuation " " headers in
    let lines = Str.split (Str.regexp "\n") headers in
    let rows =
      List.flatten
        (List.map
           (fun line ->
              (* parse heading *)
              let key, value =
                match Str.bounded_split_delim colon_delim line 2 with
                  | [key; value] -> key, value
                  | _ -> "", line in
              [Nethtml.Element
                 ("tr", [],
                  [Nethtml.Element
                     ("th", ["align", "left"], [Nethtml.Data key]);
                   Nethtml.Element
                     ("td", [], [Nethtml.Data value])]);
               Nethtml.Data "\n"])
           lines) in
    write [Nethtml.Element ("table", [], Nethtml.encode rows);
           Nethtml.Element ("hr", [], []);
           Nethtml.Data "\n\n"]
  with Stream.Failure -> ()

Converting HTML to ASCII

#load "unix.cma";;

let slurp_channel channel =
  let buffer_size = 4096 in
  let buffer = Buffer.create buffer_size in
  let string = String.create buffer_size in
  let chars_read = ref 1 in
  while !chars_read <> 0 do
    chars_read := input channel string 0 buffer_size;
    Buffer.add_substring buffer string 0 !chars_read
  done;
  Buffer.contents buffer

let () =
  let process = Unix.open_process_in ("lynx -dump " ^ filename) in
  let ascii = slurp_channel process in
  ignore (Unix.close_process_in process);
  (* ... *)

Extracting or Removing HTML Tags

(* Nethtml can be used to safely isolate and extract the text elements
   from an HTML document. *)
#use "topfind";;
#require "netstring";;

(* Load the HTML document. *)
let channel = new Netchannels.input_channel (open_in filename)
let html = Nethtml.parse channel
let () = channel#close_in ()

(* Convert the document to plain text. *)
let plain_text =
  let text = ref [] in
  let rec loop html =
    List.iter
      (function
         | Nethtml.Data s -> text := s :: !text
         | Nethtml.Element (_, _, children) -> loop children)
      html in
  loop (Nethtml.decode html);
  String.concat "" (List.rev !text)

(*-----------------------------*)

#!/usr/bin/ocaml
(* htitle - get html title from URL *)

#use "topfind";;
#require "str";;
#require "netclient";;

open Http_client.Convenience

let ltrim = Str.global_replace (Str.regexp "^[ \r\n\t\x00\x0B]*") ""
let rtrim = Str.global_replace (Str.regexp "[ \r\n\t\x00\x0B]*$") ""
let trim s = rtrim (ltrim s)

let find_title html =
  let title = ref "" in
  let rec loop = function
    | Nethtml.Element ("title", _, Nethtml.Data data :: _) ->
        title := trim data; raise Exit
    | Nethtml.Element (_, _, children) -> List.iter loop children
    | _ -> () in
  (try List.iter loop html with Exit -> ());
  !title

let urls =
  if Array.length Sys.argv > 1
  then List.tl (Array.to_list Sys.argv)
  else (Printf.eprintf "usage: %s url ...\n" Sys.argv.(0); exit 1)

let () =
  List.iter
    (fun url ->
       print_string (url ^ ": ");
       try
         let res = http_get url in
         let ch = new Netchannels.input_string res in
         let html = Nethtml.parse ch in
         print_endline (find_title html)
       with
         | Http_client.Http_error (status, _) ->
             Printf.printf "%d %s\n" status
               (Nethttp.string_of_http_status
                  (Nethttp.http_status_of_int status))
         | Failure s -> print_endline s)
    urls

Finding Stale Links

#!/usr/bin/ocaml
(* churl - check urls *)

#use "topfind";;
#require "netclient";;

open Http_client.Convenience
open Nethtml

let rec fold_elements f accu = function
  | Element (_, _, children) as element ->
      let accu =
        List.fold_right
          (fun child accu ->
             fold_elements f accu child)
          children
          accu in
      f accu element
  | other -> accu

type link = A of string | IMG of string

let find_links elements =
  List.flatten
    (List.map
       (fold_elements
          (fun accu element ->
             match element with
               | Element ("a", attribs, _) ->
                   (try A (List.assoc "href" attribs) :: accu
                    with Not_found -> accu)
               | Element ("img", attribs, _) ->
                   (try IMG (List.assoc "src" attribs) :: accu
                    with Not_found -> accu)
               | _ -> accu)
          [])
       elements)

let check_url url =
  Printf.printf "  %s: %s\n%!" url
    (match (http_head_message url)#status with
       | `Successful -> "OK"
       | _ -> "BAD")

let () =
  if Array.length Sys.argv <> 2
  then (Printf.eprintf "usage: %s <start_url>\n" Sys.argv.(0); exit 1)

let base_url = Sys.argv.(1)
let elements = parse (new Netchannels.input_string (http_get base_url))
let url_syntax = Hashtbl.find Neturl.common_url_syntax "http"
let url_syntax =
  {url_syntax with
     Neturl.url_enable_fragment = Neturl.Url_part_allowed}
let url_syntax = Neturl.partial_url_syntax url_syntax

module StringSet = Set.Make(String)

let () =
  print_endline (base_url ^ ":");
  StringSet.iter check_url
    (List.fold_left
       (fun accu s ->
          try
            StringSet.add
              (Neturl.string_of_url
                 (Neturl.apply_relative_url
                    (Neturl.url_of_string url_syntax base_url)
                    (Neturl.url_of_string url_syntax s)))
              accu
          with Neturl.Malformed_URL ->
            Printf.eprintf "Malformed URL: %s\n%!" s;
            accu)
       StringSet.empty
       (List.map
          (function
             | A href -> href
             | IMG src -> src)
          (find_links elements)))

Finding Fresh Links

#!/usr/bin/ocaml
(* surl - sort URLs by their last modification date *)

#use "topfind";;
#require "netclient";;

open Http_client.Convenience

let dates = ref []

let () =
  try
    while true do
      let url = input_line stdin in
      let call = http_head_message url in
      let date =
        try Some (Netdate.parse
                    (call#response_header#field "Last-Modified"))
        with Not_found -> None in
      dates := (date, url) :: !dates
    done
  with End_of_file -> ()

let () =
  List.iter
    (fun (date, url) ->
       Printf.printf "%-25s %s\n"
         (match date with
            | Some date -> Netdate.format "%a %b %d %H:%M:%S %Y" date
            | None -> "<NONE SPECIFIED>")
         url)
    (List.rev (List.sort compare !dates))

Creating HTML Templates

(* Template replacement using regular expressions from the Str module. *)
#load "str.cma";;

let slurp_channel channel =
  let buffer_size = 4096 in
  let buffer = Buffer.create buffer_size in
  let string = String.create buffer_size in
  let chars_read = ref 1 in
  while !chars_read <> 0 do
    chars_read := input channel string 0 buffer_size;
    Buffer.add_substring buffer string 0 !chars_read
  done;
  Buffer.contents buffer

let slurp_file filename =
  let channel = open_in_bin filename in
  let result =
    try slurp_channel channel
    with e -> close_in channel; raise e in
  close_in channel;
  result

let template_regexp = Str.regexp "%%\\([^%]+\\)%%"

let template filename fillings =
  let text = slurp_file filename in
  let eval s =
    try Hashtbl.find fillings s
    with Not_found -> "" in
  let replace _ =
    eval (Str.matched_group 1 text) in
  Str.global_substitute template_regexp replace text

(*-----------------------------*)

(* Alternative implementation: a hand-written stream parser. This version
   avoids loading the whole template into memory, so it is efficient for
   large files. *)

let template filename fillings =
  let f = open_in filename in
  try
    let buffer = Buffer.create (in_channel_length f) in
    let text = Stream.of_channel f in
    let eval s =
      try Hashtbl.find fillings s
      with Not_found -> "" in
    let rec search () =
      match Stream.peek text with
        | None -> ()
        | Some '%' ->
            Stream.junk text;
            (match Stream.peek text with
               | None ->
                   Buffer.add_char buffer '%';
                   search ()
               | Some '%' ->
                   Stream.junk text;
                   replace ""
               | Some c ->
                   Stream.junk text;
                   Buffer.add_char buffer '%';
                   Buffer.add_char buffer c;
                   search ())
        | Some c ->
            Stream.junk text;
            Buffer.add_char buffer c;
            search ()
    and replace acc =
      match Stream.peek text with
        | None ->
            Buffer.add_string buffer "%%";
            Buffer.add_string buffer acc
        | Some '%' ->
            Stream.junk text;
            (match Stream.peek text with
               | None ->
                   Buffer.add_string buffer "%%";
                   Buffer.add_string buffer acc;
                   Buffer.add_char buffer '%'
               | Some '%' ->
                   Stream.junk text;
                   Buffer.add_string buffer (eval acc);
                   search ()
               | Some c ->
                   Stream.junk text;
                   replace (acc ^ "%" ^ (String.make 1 c)))
        | Some c ->
            Stream.junk text;
            replace (acc ^ (String.make 1 c)) in
    search ();
    close_in f;
    Buffer.contents buffer
  with e ->
    close_in f;
    raise e

(*-----------------------------*)

(* simple.template contains the following:

<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for %%username%%</TITLE></HEAD>
<BODY><H1>Report for %%username%%</H1>
%%username%% logged in %%count%% times, for a total of %%total%% minutes.

*)

let () =
  let fields = Hashtbl.create 3 in
  Hashtbl.replace fields "username" whats_his_name;
  Hashtbl.replace fields "count" (string_of_int login_count);
  Hashtbl.replace fields "total" (string_of_int minute_used);
  print_endline (template "simple.template" fields)

(* Output:

<!-- simple.template for internal template() function -->
<HTML><HEAD><TITLE>Report for ramen</TITLE></HEAD>
<BODY><H1>Report for ramen</H1>
ramen logged in 42 times, for a total of 123 minutes.

*)

(*-----------------------------*)

(* userrep - report duration of user logins using SQL database *)

let process (cgi : Netcgi.cgi) =
  cgi#set_header ~content_type:"text/html" ();
  begin
    match cgi#argument_value "username" with
      | "" ->
          cgi#out_channel#output_string "No username"
      | user ->
          let db =
            Mysql.quick_connect
              ~user:"user"
              ~password:"seekritpassword"
              ~database:"connections" () in
          let sql = Printf.sprintf "
            SELECT COUNT(duration),SUM(duration)
            FROM logins WHERE username='%s'
          " (Mysql.escape user) in
          let result = Mysql.exec db sql in
          let default d = function Some x -> x | None -> d in
          let (count, total) =
            match Mysql.fetch result with
              | None -> ("0", "0")
              | Some row ->
                  (default "0" row.(0),
                   default "0" row.(1)) in
          (* template defined in the solution above *)
          let tpl = template "report.tpl" in
          let vars = Hashtbl.create 3 in
          Hashtbl.replace vars "username" user;
          Hashtbl.replace vars "count" count;
          Hashtbl.replace vars "total" total;
          cgi#out_channel#output_string (tpl vars)
  end;
  cgi#out_channel#commit_work ()

let () =
  let config = Netcgi.default_config in
  let buffered _ ch = new Netchannels.buffered_trans_channel ch in
  Netcgi_cgi.run ~config ~output_type:(`Transactional buffered) process

Mirroring Web Pages

#use "topfind";;
#require "unix";;
#require "netclient";;

let mirror url file =
  let call = new Http_client.get url in
  begin
    try
      let mtime = (Unix.stat file).Unix.st_mtime in
      let date = Netdate.mk_mail_date mtime in
      call#set_req_header "If-Modified-Since" date
    with Unix.Unix_error _ -> ()
  end;
  call#set_response_body_storage (`File (fun () -> file));
  let pipeline = new Http_client.pipeline in
  pipeline#add call;
  pipeline#run ();
  if call#response_status = `Ok
  then (let date =
          Netdate.parse
            (call#response_header#field "Last-Modified") in
        Unix.utimes file 0.0 (Netdate.since_epoch date));
  call#response_status

Creating a Robot

#load "str.cma";;

(* Parse "robots.txt" content from a stream of lines and return a
   list of user agents and a multi-valued hash table containing the
   rules for each user agent. *)
let parse_robots =
  let module S = Set.Make(struct
                            type t = string
                            let compare = compare
                          end) in

  (* Precompile regular expressions. *)
  let comments = Str.regexp "#.*" in
  let leading_white = Str.regexp "^[ \t]+" in
  let trailing_white = Str.regexp "[ \t\r]+$" in
  let colon_delim = Str.regexp "[ \t]*:[ \t]*" in

  fun stream ->
    let user_agent = ref "*" in
    let user_agents = ref (S.singleton "*") in
    let rules = Hashtbl.create 0 in

    Stream.iter
      (fun s ->
         let s = Str.replace_first comments "" s in
         let s = Str.replace_first leading_white "" s in
         let s = Str.replace_first trailing_white "" s in
         if String.length s > 0 then
           match Str.bounded_split_delim colon_delim s 2 with
             | ["User-agent"; value] ->
                 (* Found a new User-agent. *)
                 user_agent := value;
                 user_agents := S.add value !user_agents
             | ["Sitemap"; value] ->
                 (* Sitemaps are always global. *)
                 Hashtbl.add rules "*" ("Sitemap", value)
             | [key; value] ->
                 (* Found a rule for the current User-agent. *)
                 Hashtbl.add rules !user_agent (key, value)
             | _ -> failwith s)
      stream;
    S.elements !user_agents, rules

(* Produce a stream of lines from an input channel. *)
let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

(* Produce a stream of lines from a string in memory. *)
let line_stream_of_string string =
  Stream.of_list (Str.split (Str.regexp "\n") string)

(*-----------------------------*)

(* Use Ocamlnet to retrieve a "robots.txt" file and print its rules. *)

#use "topfind";;
#require "netclient";;
open Http_client.Convenience

let agents, rules =
  parse_robots
    (line_stream_of_string
       (http_get "http://sourceforge.net/robots.txt"))

let () =
  List.iter
    (fun agent ->
       Printf.printf "User-agent: %s\n" agent;
       List.iter
         (fun (key, value) ->
            Printf.printf "\t%s: %s\n" key value)
         (Hashtbl.find_all rules agent))
    agents

Parsing a Web Server Log File

(* Use the Weblogs library by Richard Jones:
   http://merjis.com/developers/weblogs

   You will also need the HostIP library:
   http://merjis.com/developers/hostip *)

let log = Weblogs.import_file "/var/log/apache2/access.log"

let () =
  Array.iter
    (fun {Weblogs.src_ip=client;
          remote_username=identuser;
          username=authuser;
          t=datetime;
          http_method=method';
          full_url=url;
          http_version=protocol;
          rcode=status;
          size=bytes;
          (* Many more fields are available.
             See Weblogs API documentation for details. *)
         } ->
       (* ... *)
       ())
    log

Processing Server Logs

#!/usr/bin/ocaml
(* sumwww - summarize web server log activity *)

#use "topfind";;
#require "weblogs";;

open Weblogs

let file =
  if Array.length Sys.argv = 2
  then Sys.argv.(1)
  else (Printf.eprintf "usage: %s <logfile>\n" Sys.argv.(0);
        exit 1)

let format_date = CalendarLib.Printer.CalendarPrinter.sprint "%d/%b/%Y"

let incr_hash hash key by =
  Hashtbl.replace hash key
    (try Hashtbl.find hash key + by
     with Not_found -> by)

let count_hash hash =
  let count = ref 0 in
  Hashtbl.iter (fun _ _ -> incr count) hash;
  !count

let add_hash dest src =
  Hashtbl.iter (incr_hash dest) src

let lastdate = ref ""

let count = ref 0
let posts = ref 0
let homes = ref 0
let bytesum = ref 0l
let hosts = ref (Hashtbl.create 0)
let whats = ref (Hashtbl.create 0)

let sumcount = ref 0
let allposts = ref 0
let allhomes = ref 0
let bytesumsum = ref 0l
let allhosts = ref (Hashtbl.create 0)
let allwhats = ref (Hashtbl.create 0)

(* display the tallies of hosts and URLs *)
let write_report () =
  Printf.printf "%s %7d %8d %8d %7d %7d %14ld\n%!"
    !lastdate (count_hash !hosts) !count (count_hash !whats)
    !posts !homes !bytesum;

  (* add to summary data *)
  sumcount := !sumcount + !count;
  bytesumsum := Int32.add !bytesumsum !bytesum;
  allposts := !allposts + !posts;
  allhomes := !allhomes + !homes;

  (* reset daily data *)
  count := 0;
  posts := 0;
  homes := 0;
  bytesum := 0l;
  add_hash !allhosts !hosts;
  add_hash !allwhats !whats;
  Hashtbl.clear !hosts;
  Hashtbl.clear !whats

(* read log file and tally hits from the host and to the URL *)
let daily_logs () =
  let log = import_file file in
  print_endline
    "    Date     Hosts  Accesses  Unidocs   POST    Home       Bytes";
  print_endline
    "----------- ------- -------- -------- ------- ------- --------------";
  Array.iter
    (fun row ->
       let date = format_date row.t in
       let host = row.src_ip in
       let what = row.url in
       let post = row.http_method = POST in
       let home = what = "/" in
       let bytes = match row.size with Some n -> n | None -> 0 in
       if !lastdate = "" then lastdate := date;
       if !lastdate <> date then write_report ();
       lastdate := date;
       incr count;
       if post then incr posts;
       if home then incr homes;
       incr_hash !hosts host 1;
       incr_hash !whats what 1;
       bytesum := Int32.add !bytesum (Int32.of_int bytes))
    log;
  if !count > 0 then write_report ()

let summary () =
  lastdate := "Grand Total";
  count := !sumcount;
  bytesum := !bytesumsum;
  hosts := !allhosts;
  posts := !allposts;
  whats := !allwhats;
  homes := !allhomes;
  write_report ()

let () =
  daily_logs ();
  summary ();
  exit 0

(*-----------------------------*)

#!/usr/bin/ocaml
(* aprept - report on Apache logs *)

#use "topfind";;
#require "weblogs";;

open Weblogs

let file =
  if Array.length Sys.argv = 2
  then Sys.argv.(1)
  else (Printf.eprintf "usage: %s <logfile>\n" Sys.argv.(0);
        exit 1)

let log = import_file file
let conn = HostIP.connection ()

let incr_hash hash key by =
  Hashtbl.replace hash key
    (try Hashtbl.find hash key + by
     with Not_found -> by)

let report_countries () =
  let total = ref 0 in
  let countries = Hashtbl.create 0 in
  Array.iter
    (fun row ->
       let country =
         match HostIP.get_country_name conn row.src_ip with
           | Some country -> country
           | None -> "UNKNOWN" in
       incr_hash countries country 1;
       incr total)
    log;
  let country_records = ref [] in
  Hashtbl.iter
    (fun country count ->
       country_records := (count, country) :: !country_records)
    countries;
  print_endline "Domain                  Records";
  print_endline "===============================";
  List.iter
    (fun (count, country) ->
       Printf.printf "%18s %5d %5.2f%%\n%!"
         country count (float count *. 100. /. float !total))
    (List.rev (List.sort compare !country_records))

let report_files () =
  let total = ref 0 in
  let totalbytes = ref 0l in
  let bytes = Hashtbl.create 0 in
  let records = Hashtbl.create 0 in
  Array.iter
    (fun row ->
       let file = row.url in
       let size = match row.size with Some n -> n | None -> 0 in
       incr_hash bytes file size;
       incr_hash records file 1;
       totalbytes := Int32.add !totalbytes (Int32.of_int size);
       incr total)
    log;
  let file_records = ref [] in
  Hashtbl.iter
    (fun file size ->
       let count = Hashtbl.find records file in
       file_records := (file, size, count) :: !file_records)
    bytes;
  print_endline
    "File                               Bytes          Records";
  print_endline
    "=========================================================";
  List.iter
    (fun (file, size, count) ->
       Printf.printf "%-22s %10d %5.2f%% %9d %5.2f%%\n%!"
         file size
         (float size *. 100. /. Int32.to_float !totalbytes)
         count
         (float count *. 100. /. float !total))
    (List.sort compare !file_records)

let () =
  report_countries ();
  print_newline ();
  report_files ()

Program: htmlsub

#!/usr/bin/ocaml
(* htmlsub - make substitutions in normal text of HTML files *)

#use "topfind";;
#require "str";;
#require "netstring";;

let usage () =
  Printf.eprintf "Usage: %s <from> <to> <file>...\n" Sys.argv.(0);
  exit 1

let from, to', files =
  match List.tl (Array.to_list Sys.argv) with
    | from :: to' :: files -> from, to', files
    | _ -> usage ()

let rec map_data f = function
  | Nethtml.Data data -> Nethtml.Data (f data)
  | Nethtml.Element (name, attribs, children) ->
      Nethtml.Element (name, attribs, List.map (map_data f) children)

let regexp = Str.regexp_string from
let buffer = Buffer.create 0
let out_channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write out_channel (Nethtml.encode html)

let () =
  List.iter
    (fun file ->
       let in_channel = new Netchannels.input_channel (open_in file) in
       let html = Nethtml.decode (Nethtml.parse in_channel) in
       in_channel#close_in ();
       write (List.map (map_data (Str.global_replace regexp to')) html))
    files;
  print_endline (Buffer.contents buffer)

Program: hrefsub

#!/usr/bin/ocaml
(* hrefsub - make substitutions in <A HREF="..."> fields of HTML files *)

#use "topfind";;
#require "str";;
#require "netstring";;

let usage () =
  Printf.eprintf "Usage: %s <from> <to> <file>...\n" Sys.argv.(0);
  exit 1

let from, to', files =
  match List.tl (Array.to_list Sys.argv) with
    | from :: to' :: files -> from, to', files
    | _ -> usage ()

let rec map_attr tag attr f = function
  | Nethtml.Data _ as d -> d
  | Nethtml.Element (name, attribs, children)
      when tag = name && List.mem_assoc attr attribs ->
      let value = List.assoc attr attribs in
      Nethtml.Element (name,
                       (attr, f value)
                       :: List.remove_assoc attr attribs,
                       List.map (map_attr tag attr f) children)
  | Nethtml.Element (name, attribs, children) ->
      Nethtml.Element (name,
                       attribs,
                       List.map (map_attr tag attr f) children)

let regexp = Str.regexp_string from
let buffer = Buffer.create 0
let out_channel = new Netchannels.output_buffer buffer
let write html = Nethtml.write out_channel (Nethtml.encode html)

let () =
  List.iter
    (fun file ->
       let in_channel = new Netchannels.input_channel (open_in file) in
       let html = Nethtml.decode (Nethtml.parse in_channel) in
       in_channel#close_in ();
       write (List.map (map_attr "a" "href"
                          (Str.global_replace regexp to')) html))
    files;
  print_endline (Buffer.contents buffer)