PLEAC-Objective CAML
Prev		Next

6. Pattern Matching

Introduction

(* We will use the Str library distributed with OCaml for regular expressions.
 * There are two ways to use the str library, building a top or passing it to ocaml.
 * Under Unix, you can create a new toplevel which has the Str module:
 *      $ ocamlmktop -o strtop str.cma 
 *      $ ./strtop
 *   Now you don't need to prefix the contents of the str module with Str.
 * The alternative is to pass str.cma as a parameter:
 *     $ ocaml str.cma
 *   Now you may refer to the contents of the str module by using Str. 
 * Under Windows, if you are using ocamlwin.exe you can simply load Str:
 *      # load "str.cma";;
 *)
(* Str.search_forward returns an int or throws an exception if the pattern isn't found.
 * In Perl, the =~ operator returns null. Since these two values have different
 * types in OCaml, we cannot copy this behaviour directly.
 * Instead, we return an impossible index, -1 using try ... with.
 * Another method would be to define an =~ operator and use that directly:
# let (=~) s re = Str.string_match (Str.regexp re) s 0;; 
val ( =~ ) : string -> string -> bool = <fun>
# "abc" =~ "a";;
- : bool = true
# "bc" =~ "a";;
- : bool = false
 * Don't underestimate the power of this. Many of the following examples could be 
 * simplified by defining infix operators.
 *)
try Str.search_forward (Str.regexp pattern) string 0;
with Not_found -> -1;;

try Str.replace_first (Str.regexp pattern) replacement string;
with Not_found -> "";;
(*-----------------------------*)
try (Str.search_forward (Str.regexp "sheep") meadow 0) > -1;
with Not_found -> false;; (* true if meadow contains "sheep" *)

try not ((Str.search_forward (Str.regexp "sheep") meadow 0) > -1);
with Not_found -> true;; (* true if meadow doesn't contain "sheep" *)

let meadow = 
    try Str.replace_first (Str.regexp "old") "new" meadow;
    with Not_found -> meadow;; (* Replace "old" with "new" in meadow *)
(*-----------------------------*)
try 
    let temp = Str.search_forward (Str.regexp "\\bovines?\\b") meadow 0 in
        print_string "Here be sheep!";
with Not_found -> ();;
(*-----------------------------*)
let string = "good food" in
    try
        Str.replace_first (Str.regexp "o*") "e" string;
    with Not_found -> string;;
(*-----------------------------*)
(* There is no way to take command line parameters to ocaml that I know of. 
 * You would first have to compile your OCaml program using ocamlc.
 *)
(*-----------------------------*)
let rec match_num s start=
    if String.length s > 0 then
        try 
            let temp = Str.search_forward (Str.regexp "[0123456789]+") s start in
                print_string (String.concat "" ("Found number " :: Str.matched_string s :: ["\n"]));
                match_num s (temp + 1);
        with Not_found -> ();
    else
        ();;
(*-----------------------------*)
let rec match_group s start numbers=
    if String.length s > 0 then
        try 
            let temp = (Str.search_forward (Str.regexp "[0123456789]+") s start) in
                let numbers = Str.matched_string s :: numbers in 
                    match_group s (temp + 1) numbers;
        with Not_found -> numbers;
    else
        numbers;;
(*-----------------------------*)
let (=+) s re = 
    let result = ref [] in
    let offset = ref 0 in
    while ((String.length s) > !offset) do
        try
            offset := 1 + (Str.search_forward (Str.regexp re) s !offset);
            result := !result @ [Str.matched_string s] @ [];
        with Not_found -> ignore (offset := String.length s)
    done;
    result;;
    
let (=-) s re = 
    let result = ref [] in
    let offset = ref 0 in
    while ((String.length s) > !offset) do
        try
            ignore (Str.search_forward (Str.regexp re) s !offset);
            offset := Str.match_end ();
            result := !result @ [Str.matched_string s] @ [];
        with Not_found -> ignore (offset := String.length s)
    done;
    result;;

let digits = "123456789";;
let yeslap = digits =+ "[1234567890][1234567890][1234567890]";;
let nonlap = digits =- "[1234567890][1234567890][1234567890]";;

print_string "Non-overlapping: ";
List.iter (fun v -> print_string (v ^ " ")) !nonlap;
print_string "\n";;
(* Non-overlapping: 123 456 789 *)

print_string "Overlapping: ";
List.iter (fun v -> print_string (v ^ " ")) !yeslap;
print_string "\n";;
(* Overlapping: 123 234 345 456 567 678 789 *)
(*-----------------------------*)
let index = ref 0;;
let string = "And little lambs eat ivy";;
try
    index := Str.search_forward (Str.regexp "l[^s]*s") string 0;
with Not_found -> ();;
    
print_string ("(" ^ (String.sub string 0 !index) ^ ") ");
print_string ("(" ^ (Str.matched_string string) ^ ") ");
print_string ("(" ^ (Str.string_after string 16) ^ ")\n");;
(* (And ) (little lambs) ( eat ivy) *)

Copying and Substituting Simultaneously

#load "str.cma";;

(* The Str module doesn't modify strings in place; you always get
   a copy when you perform a substitution. *)
let dst = Str.global_replace (Str.regexp "this") "that" src

(* Strip to basename. *)
let progname = Str.replace_first (Str.regexp "^.*/") "" Sys.argv.(0)

(* Make All Words Title-Cased. *)
let capword =
  Str.global_substitute
    (Str.regexp "\\b.")
    (fun s -> String.uppercase (Str.matched_string s))
    words

(* /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1 *)
let catpage =
  Str.replace_first (Str.regexp "man\\([0-9]\\)") "cat\\1" manpage

(* Copy and substitute on all strings in a list. *)
let bindirs = ["/usr/bin"; "/bin"; "/usr/local/bin"]
let libdirs =
  List.map (fun s -> Str.replace_first (Str.regexp "bin") "lib" s)
    bindirs
(* ["/usr/lib"; "/lib"; "/usr/local/lib"] *)

Matching Letters

(* Str can do a simple character range match, but it isn't very
   practical for matching alphabetic characters in general. *)
#load "str.cma";;
let () =
  if Str.string_match (Str.regexp "^[A-Za-z]+$") var 0
  then print_endline "var is purely alphabetic"

(* With Pcre, you can use UTF8 support and match characters with
   the letter property. *)
#directory "+pcre";;
#load "pcre.cma";;
let () =
  if Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`UTF8] "^\\pL+$") var
  then print_endline "var is purely alphabetic"

Matching Words

#load "str.cma";;

(* Str's regexps lack a whitespace-matching pattern.
   Here is a substitute. *)
let whitespace_chars =
  String.concat ""
    (List.map (String.make 1)
       [
         Char.chr 9;  (* HT *)
         Char.chr 10; (* LF *)
         Char.chr 11; (* VT *)
         Char.chr 12; (* FF *)
         Char.chr 13; (* CR *)
         Char.chr 32; (* space *)
       ])
let space = "[" ^ whitespace_chars ^ "]"
let non_space = "[^" ^ whitespace_chars ^ "]"

(* as many non-whitespace characters as possible *)
let regexp = Str.regexp (non_space ^ "+")

(* as many letters, apostrophes, and hyphens *)
let regexp = Str.regexp "[A-Za-z'-]+"

(* usually best *)
let regexp = Str.regexp "\\b\\([A-Za-z]+\\)\\b"

(* fails at ends or w/ punctuation *)
let regexp = Str.regexp (space ^ "\\([A-Za-z]+\\)" ^ space)

Commenting Regular Expressions

#!/usr/bin/ocaml
(* resname - change all "foo.bar.com" style names in the input stream
   into "foo.bar.com [204.148.40.9]" (or whatever) instead *)

#directory "+pcre";;
#load "pcre.cma";;
#load "unix.cma";;

let regexp =
  Pcre.regexp ~flags:[`EXTENDED] "
    (                     # capture the hostname in substring 1
      (?:                 # these parens for grouping only
        (?! [-_]  )       # lookahead for neither underscore nor dash
        [\\w-] +          # hostname component
        \\.               # and the domain dot
      ) +                 # now repeat that whole thing a bunch of times
      [A-Za-z]            # next must be a letter
      [\\w-] +            # now trailing domain part
    )                     # end of substring 1 capture
  "

let process line =
  print_endline
    (Pcre.substitute_substrings
       ~rex:regexp
       ~subst:(fun subs ->
                 let name = Pcre.get_substring subs 1 in
                 let addr =
                   try
                     Unix.string_of_inet_addr
                       (Unix.gethostbyname name).Unix.h_addr_list.(0)
                   with Not_found -> "???" in
                 name ^ " [" ^ addr ^ "]")
       line)

let () =
  try
    while true do
      let line = read_line () in
      process line
    done
  with End_of_file -> ()

(*-----------------------------*)

let vars = Hashtbl.create 0
let () =
  Hashtbl.replace vars "name" "Bob";
  Hashtbl.replace vars "flavor" "rhubarb"

let () =
  print_endline
    (Pcre.substitute_substrings
       ~rex:(Pcre.regexp ~flags:[`EXTENDED] "
               \\#                #   a pound sign
               (\\w+)             #   the variable name
               \\#                #   another pound sign
             ")
       ~subst:(fun subs -> Hashtbl.find vars (Pcre.get_substring subs 1))
       "Hello, #name#, would you like some #flavor# pie?")

Finding the Nth Occurrence of a Match

#load "str.cma";;

let want = 3
let count = ref 0
let pond = "One fish two fish red fish blue fish"
let regexp = Str.regexp_case_fold "\\([a-z]+\\)[ ]+fish\\b"

exception Found of string
let () =
  let start = ref 0 in
  try
    while true do
      ignore (Str.search_forward regexp pond !start);
      start := !start + String.length (Str.matched_string pond);
      incr count;
      if !count = want then raise (Found (Str.matched_group 1 pond))
    done
  with
    | Found color ->
        Printf.printf "The third fish is a %s one.\n" color
    | Not_found ->
        Printf.printf "Only found %d fish!\n" !count

(* The third fish is a red one. *)

(*-----------------------------*)

let colors =
  let start = ref 0 in
  let fish = ref [] in
  begin
    try
      while true do
        ignore (Str.search_forward regexp pond !start);
        start := !start + (String.length (Str.matched_string pond));
        fish := (Str.matched_group 1 pond) :: !fish
      done;
    with Not_found -> ()
  end;
  Array.of_list (List.rev !fish)

let () =
  Printf.printf "The third fish in the pond is %s.\n" colors.(2)

(* The third fish in the pond is red. *)

(*-----------------------------*)

let evens =
  let colors' = ref [] in
  Array.iteri
    (fun i color -> if i mod 2 = 1 then colors' := color :: !colors')
    colors;
  List.rev !colors'
let () =
  Printf.printf "Even numbered fish are %s.\n" (String.concat " " evens)

(* Even numbered fish are two blue. *)

(*-----------------------------*)

let () =
  let count = ref 0 in
  print_endline
    (Str.global_substitute
       (Str.regexp_case_fold "\\b\\([a-z]+\\)\\([ ]+fish\\b\\)")
       (fun s ->
          incr count;
          if !count = 4
          then "sushi" ^ Str.matched_group 2 s
          else Str.matched_group 1 s ^ Str.matched_group 2 s)
       pond)

(* One fish two fish red fish sushi fish *)

(*-----------------------------*)

let pond = "One fish two fish red fish blue fish swim here."
let regexp = Str.regexp_case_fold "\\b\\([a-z]+\\)[ ]+fish\\b"
let colors =
  let rec loop start acc =
    try
      ignore (Str.search_forward regexp pond start);
      loop
        (start + String.length (Str.matched_string pond))
        (Str.matched_group 1 pond :: acc)
    with Not_found ->
      acc in
  loop 0 []
let color = List.hd colors
let () = Printf.printf "Last fish is %s.\n" color

(* Last fish is blue. *)

Matching Multiple Lines

#!/usr/bin/ocaml
(* killtags - very bad html tag killer *)
#load "str.cma";;
let regexp = Str.regexp "<[^>]*>"
let () =
  List.iter
    (fun filename ->
       let lines = ref [] in
       let in_channel = open_in filename in
       try
         begin
           try while true do lines := input_line in_channel :: !lines done
           with End_of_file -> ()
         end;
         let contents = String.concat "\n" (List.rev !lines) in
         print_endline
           (String.concat ""
              (List.map
                 (function
                    | Str.Text s -> s
                    | _ -> "")
                 (Str.full_split regexp contents)));
         close_in in_channel
       with e ->
         close_in in_channel;
         raise e)
    (List.tl (Array.to_list Sys.argv))

(*-----------------------------*)

#!/usr/bin/ocaml
(* headerfy - change certain chapter headers to html *)
#load "str.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let regexp = Str.regexp "^Chapter[\r\n\t ]+[0-9]+[\r\n\t ]*:[^\r\n]*"

let headerfy chunk =
  String.concat ""
    (List.map
       (function
          | Str.Text s -> s
          | Str.Delim s -> "<H1>" ^ s ^ "</H1>")
       (Str.full_split regexp chunk))

let () =
  List.iter
    (fun filename ->
       let in_channel = open_in filename in
       try
         Stream.iter
           (fun para ->
              print_endline (headerfy para);
              print_newline ())
           (paragraph_stream_of_channel in_channel);
         close_in in_channel
       with e ->
         close_in in_channel;
         raise e)
    (List.tl (Array.to_list Sys.argv))

Reading Records with a Pattern Separator

#load "str.cma";;
let chunks =
  let lines = ref [] in
  begin
    try while true do lines := input_line stdin :: !lines done
    with End_of_file -> ()
  end;
  let contents = String.concat "\n" (List.rev !lines) in
  Str.full_split (Str.regexp "^\\.\\(Ch\\|Se\\|Ss\\)$") contents
let () =
  Printf.printf
    "I read %d chunks.\n"
    (List.length chunks)

Extracting a Range of Lines

#load "str.cma";;

(* Creates a stream that produces ranges of items from another stream.
   Production of items starts when when (start_test count item) returns
   true and stops when (finish_test count item) returns true. Multiple
   ranges will be produced if start_test returns true again. The count
   starts at 1. Ranges are inclusive; the item that causes finish_test
   to return true will be produced. *)
let stream_range start_test finish_test stream =
  let active = ref false in
  let count = ref 1 in
  let rec next i =
    match Stream.peek stream with
      | None -> None
      | Some item ->
          if not !active then
            begin
              if start_test !count item
              then (active := true; next i)
              else (Stream.junk stream; incr count; next i)
            end
          else
            begin
              if finish_test !count item then active := false;
              Stream.junk stream;
              incr count;
              Some item
            end in
  Stream.from next

(* Creates a stream that produces items between a pair of indices.
   If start = 2 and finish = 4, items 2, 3, and 4 will be produced.
   The first item is number 1. *)
let stream_range_numbers start finish stream =
  stream_range
    (fun count _ -> count = start)
    (fun count _ -> count = finish)
    stream

(* Creates a stream that produces strings between a pair of regexps.
   The regexp will be tested using Str.string_match. *)
let stream_range_patterns start finish stream =
  stream_range
    (fun _ line -> Str.string_match start line 0)
    (fun _ line -> Str.string_match finish line 0)
    stream

(* Produce a stream of lines from an input channel. *)
let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

(* Print lines 15 through 17 inclusive. *)
let () =
  Stream.iter
    print_endline
    (stream_range_numbers 15 17
       (line_stream_of_channel (open_in datafile)))

(* Print out all <XMP> .. </XMP> displays from HTML doc. *)
let () =
  Stream.iter
    print_endline
    (stream_range_patterns
       (Str.regexp ".*<XMP>")
       (Str.regexp ".*</XMP>")
       (line_stream_of_channel stdin))

(*-----------------------------*)

let in_header = ref true
let in_body = ref false
let () =
  Stream.iter
    (fun line ->
       if !in_header && line = ""
       then (in_header := false; in_body := true)
       else
         begin
           (* do something with line *)
         end)
    (line_stream_of_channel stdin)

(*-----------------------------*)

module StringSet = Set.Make(String)
let seen = ref StringSet.empty
let email_regexp = Str.regexp "\\([^<>(),; \t]+@[^<>(),; \t]+\\)"
let () =
  Stream.iter
    (fun line ->
       List.iter
         (function
            | Str.Delim email ->
                if not (StringSet.mem email !seen)
                then
                  begin
                    seen := StringSet.add email !seen;
                    print_endline email;
                  end
            | _ -> ())
         (Str.full_split email_regexp line))
    (stream_range_patterns
       (Str.regexp "^From:?[ \t]")
       (Str.regexp "^$")
       (line_stream_of_channel stdin))

Matching Shell Globs as Regular Expressions

#load "str.cma";;

let regexp_string_of_glob s =
  let i, buffer = ref (-1), Buffer.create (String.length s + 8) in
  let read () =
    incr i;
    if !i < String.length s
    then Some s.[!i]
    else None in
  let write = Buffer.add_string buffer in
  let rec parse_glob () =
    match read () with
      | Some '*' -> write ".*"; parse_glob ()
      | Some '?' -> write "."; parse_glob ()
      | Some '[' -> parse_bracket ""
      | Some c -> write (Str.quote (String.make 1 c)); parse_glob ()
      | None -> ()
  and parse_bracket text =
    match read () with
      | Some '!' when text = "" -> parse_bracket "^"
      | Some ']' -> write ("[" ^ text ^ "]"); parse_glob ()
      | Some c -> parse_bracket (text ^ (String.make 1 c))
      | None -> write (Str.quote ("[" ^ text)) in
  write "^";
  parse_glob ();
  write "$";
  Buffer.contents buffer

let regexp_of_glob s =
  Str.regexp (regexp_string_of_glob s)

let regexp_of_glob_case_fold s =
  Str.regexp_case_fold (regexp_string_of_glob s)

Speeding Up Interpolated Matches

#load "str.cma";;

let popstates = ["CO"; "ON"; "MI"; "WI"; "MN"]

(* Naive version: Compile a regexp each time it is needed. *)
let popgrep1 () =
  try
    begin
      while true do
        let line = input_line stdin in
        try
          List.iter
            (fun state ->
               if (Str.string_match
                     (Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b"))
                     line 0)
               then (print_endline line; raise Exit))
            popstates
        with Exit -> ()
      done
    end
  with End_of_file -> ()

(* First optimization: Compile the regexps in advance. *)
let popgrep2 () =
  let popstate_regexps =
    List.map
      (fun state ->
         Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b"))
      popstates in
  try
    begin
      while true do
        let line = input_line stdin in
        try
          List.iter
            (fun regexp ->
               if (Str.string_match regexp line 0)
               then (print_endline line; raise Exit))
            popstate_regexps
        with Exit -> ()
      done
    end
  with End_of_file -> ()

(* Second optimization: Build a single regexp for all states. *)
let popgrep3 () =
  let popstates_regexp =
    Str.regexp
      (".*\\b\\("
       ^ (String.concat "\\|" (List.map Str.quote popstates))
       ^ "\\)\\b") in
  try
    begin
      while true do
        let line = input_line stdin in
        if Str.string_match popstates_regexp line 0
        then print_endline line
      done
    end
  with End_of_file -> ()

(* Speed tests with a 15,000 line input file: *)
let () = popgrep1 ()         (* time: 13.670s *)
let () = popgrep2 ()         (* time:  0.264s *)
let () = popgrep3 ()         (* time:  0.123s *)

Testing for a Valid Pattern

#load "str.cma";;
let () =
  while true do
    print_string "Pattern? ";
    flush stdout;
    let pattern = input_line stdin in
    try ignore (Str.regexp pattern)
    with Failure message ->
      Printf.printf "INVALID PATTERN: %s\n" message
  done

(*-----------------------------*)

let is_valid_pattern pattern =
  try ignore (Str.regexp pattern); true
  with Failure _ -> false

(*-----------------------------*)

#!/usr/bin/ocaml
(* paragrep - trivial paragraph grepper *)
#load "str.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let paragrep pat files =
  let regexp =
    begin
      try Str.regexp pat
      with Failure msg ->
        Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg;
        exit 1
    end in
  let count = ref 0 in
  List.iter
    (fun file ->
       let channel =
         if file = "-"
         then stdin
         else open_in file in
       try
         Stream.iter
           (fun para ->
              incr count;
              try
                ignore (Str.search_forward regexp para 0);
                Printf.printf "%s %d: %s\n\n" file !count para
              with Not_found -> ())
           (paragraph_stream_of_channel channel);
         close_in channel
       with e ->
         close_in channel;
         raise e)
    files

let () =
  match List.tl (Array.to_list Sys.argv) with
    | pat :: [] -> paragrep pat ["-"]
    | pat :: files -> paragrep pat files
    | [] -> Printf.eprintf "usage: %s pat [files]\n" Sys.argv.(0)

(*-----------------------------*)

let safe_pat = Str.quote pat

Honoring Locale Settings in Regular Expressions

(* OCaml does not provide a way to change the locale, and PCRE does
   not appear to be sensitive to the default locale. Regardless, Str
   does not support locales, and PCRE only matches ASCII characters
   for \w and friends. This example instead demonstrates the use of
   PCRE's UTF-8 support to match words, and it does not use locales. *)

#directory "+pcre";;
#load "pcre.cma";;

(* encoded as UTF-8 *)
let name = "andreas k\xc3\xb6nig"

(* the original regexp which is not Unicode-aware *)
let ascii_regexp = Pcre.regexp "\\b(\\w+)\\b"

(* a revised regexp which tests for Unicode letters and numbers *)
let utf8_regexp = Pcre.regexp ~flags:[`UTF8] "([\\pL\\pN]+)"

let () =
  List.iter
    (fun (enc, regexp) ->
       Printf.printf "%s names: %s\n" enc
         (String.concat " "
            (List.map
               String.capitalize
               (List.flatten
                  (Array.to_list
                     (Array.map
                        Array.to_list
                        (Pcre.extract_all
                           ~full_match:false
                           ~rex:regexp
                           name)))))))
    ["ASCII", ascii_regexp; "UTF-8", utf8_regexp]

(*
  ASCII names: Andreas K Nig
  UTF-8 names: Andreas König
*)

Approximate Matching

(* Calculates the Levenshtein, or edit distance, between two strings. *)
let levenshtein s t =
  let n = String.length s in
  let m = String.length t in
  match (m, n) with
    | (m, 0) -> m
    | (0, n) -> n
    | (m, n) ->
        let d = Array.init (m + 1) (fun x -> x) in
        let x = ref 0 in
        for i = 0 to n - 1 do
          let e = ref (i + 1) in
          for j = 0 to m - 1 do
            let cost = if s.[i] = t.[j] then 0 else 1 in
            x :=
              min
                (d.(j + 1) + 1)      (* insertion *)
                (min
                   (!e + 1)          (* deletion *)
                   (d.(j) + cost));  (* substitution *)
            d.(j) <- !e;
            e := !x
          done;
          d.(m) <- !x
        done;
        !x

(* Determines if two strings are an approximate match. *)
let amatch ?(percentage=20) s t =
  levenshtein s t * 100 / String.length s <= percentage

let () =
  let dict = open_in "/usr/dict/words" in
  try
    while true do
      let word = input_line dict in
      if amatch "balast" word
      then print_endline word
    done
  with End_of_file -> close_in dict

(*
  ballast
  blast
*)

Matching from Where the Last Pattern Left Off

#directory "+pcre";;
#load "pcre.cma";;

let s = "12 345 hello 6 7world89 10"
let rex = Pcre.regexp "(\\d+)"

let () =
  let subs = ref (Pcre.exec ~rex s) in
  try
    while true do
      Printf.printf "Found %s\n" (Pcre.get_substring !subs 1);
      subs := Pcre.next_match ~rex !subs
    done
  with Not_found -> ()

(*-----------------------------*)

let () =
  let n = "   49 here" in
  let n = Pcre.replace ~pat:"\\G " ~templ:"0" n in
  print_endline n

(* 00049 here *)

(*-----------------------------*)

let s = "3,4,5,9,120"
let rex = Pcre.regexp "\\G,?(\\d+)"

let () =
  let subs = ref (Pcre.exec ~rex s) in
  try
    while true do
      Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1);
      subs := Pcre.next_match ~rex !subs
    done
  with Not_found -> ()

(*-----------------------------*)

let s = "The year 1752 lost 10 days on the 3rd of September"

let rex = Pcre.regexp "(\\d+)"
let subs = ref (Pcre.exec ~rex s)

let () =
  try
    while true do
      Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1);
      subs := Pcre.next_match ~rex !subs
    done
  with Not_found -> ()

let () =
  let rex = Pcre.regexp "\\G(\\S+)" in
  subs := Pcre.next_match ~rex !subs;
  Printf.printf "Found %s after the last number.\n"
    (Pcre.get_substring !subs 1)

(*
  Found number 1752
  Found number 10
  Found number 3
  Found rd after the last number.
*)

(*-----------------------------*)

let () =
  match Pcre.get_substring_ofs !subs 1 with
    | (start, finish) ->
        Printf.printf
          "The position in 's' is %d..%d\n" start finish

(* The position in 's' is 35..37 *)

Greedy and Non-Greedy Matches

let s = "Even <TT>vi</TT> can edit <TT>troff</TT> effectively."

(* The Str library does not support non-greedy matches. In many cases,
   you can turn a non-greedy match into a greedy one, however: *)

#load "str.cma";;

let () = print_endline (Str.global_replace (Str.regexp "<.*>") "" s)
(* Even  effectively. *)
let () = print_endline (Str.global_replace (Str.regexp "<[^>]*>") "" s)
(* Even vi can edit troff effectively. *)

(* If you need non-greedy matches, you'll want to use PCRE instead: *)

#directory "+pcre";;
#load "pcre.cma";;

let () = print_endline (Pcre.replace ~pat:"<.*?>" ~templ:"" s)
(* Even vi can edit troff effectively. *)

(* Non-greedy matches don't always work the way you expect: *)

let s = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>"

let rex = Pcre.regexp "<b><i>(.*?)</i></b>"
let () = print_endline (Pcre.extract ~rex s).(1)
(* this</i> and <i>that</i> are important</b> Oh, <b><i>me too! *)

(* One solution is to use a non-grouping negative lookahead assertion: *)

let rex = Pcre.regexp "<b><i>((?:(?!</b>|</i>).)*)</i></b>"
let () = print_endline (Pcre.extract ~rex s).(1)
(* me too! *)

(* If performance is important, here is a faster technique: *)

let rex = Pcre.regexp ~flags:[`DOTALL; `EXTENDED] "
    <b><i>
    [^<]*  # stuff not possibly bad, and not possibly the end.
    (?:
 # at this point, we can have '<' if not part of something bad
     (?!  </?[ib]>  )   # what we can't have
     <                  # okay, so match the '<'
     [^<]*              # and continue with more safe stuff
    ) *
    </i></b>
"

let () = print_endline (Pcre.extract ~rex s).(0)
(* <b><i>me too!</i></b> *)

Detecting Duplicate Words

#directory "+pcre";;
#load "pcre.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let find_dup_words files =
  let rex = Pcre.regexp ~flags:[`CASELESS; `EXTENDED] "
      \\b            # start at a word boundary (begin letters)
      (\\S+)         # find chunk of non-whitespace
      \\b            # until another word boundary (end letters)
      (
          \\s+       # separated by some whitespace
          \\1        # and that very same chunk again
          \\b        # until another word boundary
      ) +            # one or more sets of those
  " in
  let count = ref 0 in
  List.iter
    (fun file ->
       let channel = if file = "-" then stdin else open_in file in
       try
         Stream.iter
           (fun para ->
              incr count;
              try
                let subs = ref (Pcre.exec ~rex para) in
                while true do
                  Printf.printf "dup word '%s' at paragraph %d.\n"
                    (Pcre.get_substring !subs 1)
                    !count;
                  flush stdout;
                  subs := Pcre.next_match ~rex !subs;
                done
              with Not_found -> ())
           (paragraph_stream_of_channel channel);
         close_in channel
       with e ->
         close_in channel;
         raise e)
    files

let () =
  match List.tl (Array.to_list Sys.argv) with
    | [] -> find_dup_words ["-"]
    | files -> find_dup_words files

(*-----------------------------*)

(*
  This is a test
  test of the duplicate word finder.
*)

(* dup word 'test' at paragraph 1. *)

(*-----------------------------*)

let a = "nobody"
let b = "bodysnatcher"
let () =
  try
    let subs =
      Pcre.exec
        ~pat:"^(\\w+)(\\w+) \\2(\\w+)$"
        (a ^ " " ^ b) in
    Printf.printf "%s overlaps in %s-%s-%s\n"
      (Pcre.get_substring subs 2)
      (Pcre.get_substring subs 1)
      (Pcre.get_substring subs 2)
      (Pcre.get_substring subs 3)
  with Not_found ->
    ()

(* body overlaps in no-body-snatcher *)

(*-----------------------------*)

#!/usr/bin/ocaml
(* prime_pattern -- find prime factors of argument using pattern matching *)
#directory "+pcre";;
#load "pcre.cma";;

let arg = try int_of_string Sys.argv.(1) with _ -> 0
let n = ref (String.make arg 'o')
let rex = Pcre.regexp "^(oo+?)\\1+$"
let templ = "o"
let () =
  try
    while true do
      let pat = Pcre.get_substring (Pcre.exec ~rex !n) 1 in
      Printf.printf "%d " (String.length pat);
      n := Pcre.replace ~pat ~templ !n
    done
  with Not_found ->
    Printf.printf "%d\n" (String.length !n)

(*-----------------------------*)

exception Found of (int * int * int)
let () =
  try
    match
      Pcre.extract
        ~full_match:false
        ~pat:"^(o*)\\1{11}(o*)\\2{14}(o*)\\3{15}$"
        (String.make 281 'o')
    with
      | [| x; y; z |] -> raise (Found
                                  (String.length x,
                                   String.length y,
                                   String.length z))
      | _ -> raise Not_found
  with
    | Found (x, y, z) ->
        Printf.printf "One solution is: x=%d; y=%d; z=%d.\n"
          x y z
    | Not_found ->
        Printf.printf "No solution.\n"

(* One solution is: x=17; y=3; z=2. *)

(*-----------------------------*)

~pat:"^(o+)\\1{11}(o+)\\2{14}(o+)\\3{15}$"
(* One solution is: x=17; y=3; z=2. *)

~pat:"^(o*?)\\1{11}(o*)\\2{14}(o*)\\3{15}$"
(* One solution is: x=0; y=7; z=11. *)

~pat:"^(o+?)\\1{11}(o*)\\2{14}(o*)\\3{15}$"
(* One solution is: x=1; y=3; z=14. *)

Expressing AND, OR, and NOT in a Single Pattern

#directory "+pcre";;
#load "pcre.cma";;

let pat = input_line config_channel
let () = if Pcre.pmatch ~pat data then (* ... *) ()

(*-----------------------------*)

(* alpha OR beta *)
let regexp = Pcre.regexp "alpha|beta"

(* alpha AND beta *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?=.*alpha)(?=.*beta)"

(* alpha AND beta, no overlap *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "alpha.*beta|beta.*alpha"

(* NOT pat *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?:(?!pat).)*$"

(* NOT bad BUT good *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "(?=(?:(?!bad).)*$)good"

(*-----------------------------*)

let () =
  if not (Pcre.pmatch ~rex:regexp text)
  then something ()

(*-----------------------------*)

let () =
  if (Pcre.pmatch ~rex:rexexp1 text) && (Pcre.pmatch ~rex:rexexp2 text)
  then something ()

(*-----------------------------*)

let () =
  if (Pcre.pmatch ~rex:rexexp1 text) || (Pcre.pmatch ~rex:rexexp2 text)
  then something ()

(*-----------------------------*)

#!/usr/bin/ocaml
(* minigrep - trivial grep *)
#directory "+pcre";;
#load "pcre.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let minigrep pat files =
  let rex =
    try Pcre.regexp pat
    with Pcre.BadPattern (msg, _) ->
      Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg;
      exit 1 in
  let process file =
    let channel = if file = "-" then stdin else open_in file in
    try
      Stream.iter
        (fun line -> if Pcre.pmatch ~rex line then print_endline line)
        (line_stream_of_channel channel);
      close_in channel
    with e ->
      close_in channel;
      raise e in
  List.iter process files

let () =
  match List.tl (Array.to_list Sys.argv) with
    | pat :: [] -> minigrep pat ["-"]
    | pat :: files -> minigrep pat files
    | [] -> Printf.eprintf "usage: %s pat [files]\n" Sys.argv.(0)

(*-----------------------------*)

let string = "labelled"

let () =
  Printf.printf "%b\n"
    (Pcre.pmatch
       ~rex:(Pcre.regexp ~flags:[`DOTALL] "^(?=.*bell)(?=.*lab)")
       string)

let () =
  Printf.printf "%b\n"
    (Pcre.pmatch ~pat:"bell" string && Pcre.pmatch ~pat:"lab" string)

let () =
  if (Pcre.pmatch
        ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] "
             ^              # start of string
            (?=             # zero-width lookahead
                .*          # any amount of intervening stuff
                bell        # the desired bell string
            )               # rewind, since we were only looking
            (?=             # and do the same thing
                .*          # any amount of intervening stuff
                lab         # and the lab part
            )")
        string)
  then print_endline "Looks like Bell Labs might be in Murray Hill!"

let () =
  Printf.printf "%b\n"
    (Pcre.pmatch ~pat:"(?:^.*bell.*lab)|(?:^.*lab.*bell)" string)

let brand = "labelled"
let () =
  if (Pcre.pmatch
        ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] "
        (?:                 # non-capturing grouper
            ^ .*?           # any amount of stuff at the front
              bell          # look for a bell
              .*?           # followed by any amount of anything
              lab           # look for a lab
          )                 # end grouper
    |                       # otherwise, try the other direction
        (?:                 # non-capturing grouper
            ^ .*?           # any amount of stuff at the front
              lab           # look for a lab
              .*?           # followed by any amount of anything
              bell          # followed by a bell
          )                 # end grouper
      ") brand)
  then print_endline "Our brand has bell and lab separate."

let map = "a map of the world"
let () =
  Printf.printf "%b\n"
    (Pcre.pmatch
       ~rex:(Pcre.regexp ~flags:[`DOTALL] "^(?:(?!waldo).)*$")
       map)

let () =
  if (Pcre.pmatch
        ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] "
        ^                   # start of string
        (?:                 # non-capturing grouper
            (?!             # look ahead negation
                waldo       # is he ahead of us now?
            )               # is so, the negation failed
            .               # any character (cuzza /s)
        ) *                 # repeat that grouping 0 or more
        $                   # through the end of the string
      ") map)
  then print_endline "There's no waldo here!"

(*-----------------------------*)

% w | minigrep '^(?!.*ttyp).*tchrist'

(*-----------------------------*)

Pcre.regexp ~flags:[`EXTENDED] "
    ^                       # anchored to the start
    (?!                     # zero-width look-ahead assertion
        .*                  # any amount of anything (faster than .*?)
        ttyp                # the string you don't want to find
    )                       # end look-ahead negation; rewind to start
    .*                      # any amount of anything (faster than .*?)
    tchrist                 # now try to find Tom
"

(*-----------------------------*)

% w | grep tchrist | grep -v ttyp

(*-----------------------------*)

% grep -i 'pattern' files
% minigrep '(?i)pattern' files

Matching Multiple-Byte Characters

#load "str.cma";;

(* Regexp text for an EUC-JP character *)
let eucjp =
  (String.concat "\\|"
     (* EUC-JP encoding subcomponents: *)
     [
       (* ASCII/JIS-Roman (one-byte/character) *)
       "[\x00-\x7F]";

       (* half-width katakana (two bytes/char) *)
       "\x8E[\xA0-\xDF]";

       (* JIS X 0212-1990 (three bytes/char) *)
       "\x8F[\xA1-\xFE][\xA1-\xFE]";

       (* JIS X 0208:1997 (two bytes/char) *)
       "[\xA1-\xFE][\xA1-\xFE]";
     ])

(* Match any number of EUC-JP characters preceding Tokyo *)
let regexp = Str.regexp ("\\(\\(" ^ eucjp ^ "\\)*\\)\\(\xC5\xEC\xB5\xFE\\)")

(* Search from the beginning for a match *)
let () =
  if Str.string_match regexp string 0
  then print_endline "Found Tokyo"

(* Replace Tokyo with Osaka *)
let () =
  let buffer = Buffer.create (String.length string) in
  let start = ref 0 in
  while Str.string_match regexp string !start do
    Buffer.add_string buffer (Str.matched_group 1 string);
    Buffer.add_string buffer osaka; (* Assuming osaka is defined *)
    start := Str.match_end ();
  done;
  if !start < String.length string
  then Buffer.add_substring buffer string
    !start (String.length string - !start);
  print_endline (Buffer.contents buffer)

(* Split a multi-byte string into characters *)
let () =
  (* One character per list element *)
  let chars =
    Array.map
      (function
         | Str.Delim c -> c
         | Str.Text c -> failwith ("invalid char: " ^ c))
      (Array.of_list
         (Str.full_split
            (Str.regexp eucjp)
            string)) in
  let length = Array.length chars in
  for i = 0 to length - 1 do
    if String.length chars.(i) = 1 then
      begin
        (* Do something interesting with this one-byte character *)
      end
    else
      begin
        (* Do something interesting with this multi-byte character *)
      end
  done;
  (* Glue list back together *)
  let line = String.concat "" (Array.to_list chars) in
  print_endline line

(* Determine if an entire string is valid EUC-JP *)
let is_eucjp s =
  Str.string_match
    (Str.regexp ("\\(" ^ eucjp ^ "\\)*$")) s 0

(* Assuming a similar string has been defined for Shift-JIS *)
let is_sjis s =
  Str.string_match
    (Str.regexp ("\\(" ^ sjis ^ "\\)*$")) s 0

(* Convert from EUC-JP to Unicode, assuming a Hashtbl named
   euc2uni is defined with the appropriate character mappings *)
let () =
  let chars =
    Array.map
      (function
         | Str.Delim c -> c
         | Str.Text c -> failwith ("invalid char: " ^ c))
      (Array.of_list
         (Str.full_split
            (Str.regexp eucjp)
            string)) in
  let length = Array.length chars in
  for i = 0 to length - 1 do
    if Hashtbl.mem euc2uni chars.(i)
    then
      begin
        chars.(i) <- (Hashtbl.find euc2uni chars.(i))
      end
    else
      begin
        (* deal with unknown EUC->Unicode mapping here *)
      end
  done;
  let line = String.concat "" (Array.to_list chars) in
  print_endline line

Matching a Valid Mail Address

#load "str.cma";;

(* Not foolproof, but works in most common cases. *)
let regexp =
  Str.regexp_case_fold
    "\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z][A-Z][A-Z]?[A-Z]?\\b"

let () =
  try
    while true do
      print_string "Email: ";
      flush stdout;
      let line = input_line stdin in
      try
        let start = ref 0 in
        while true do
          start := Str.search_forward regexp line !start;
          let string = Str.matched_string line in
          start := !start + String.length string;
          print_string "Found: ";
          print_endline string;
        done
      with Not_found -> ()
    done
  with End_of_file -> ()

Matching Abbreviations

#load "str.cma";;

let () =
  try
    while true do
      print_string "Action: ";
      flush stdout;
      let answer = input_line stdin in
      let regexp = Str.regexp_string_case_fold answer in
      if Str.string_match regexp "SEND" 0
      then print_endline "Action is send"
      else if Str.string_match regexp "STOP" 0
      then print_endline "Action is stop"
      else if Str.string_match regexp "ABORT" 0
      then print_endline "Action is abort"
      else if Str.string_match regexp "LIST" 0
      then print_endline "Action is list"
      else if Str.string_match regexp "EDIT" 0
      then print_endline "Action is edit"
    done
  with End_of_file -> ()

(*-----------------------------*)

(* assumes that invoke_editor, deliver_message, *)
(* file and pager are defined somewhere else. *)
let actions =
  [
    "edit", invoke_editor;
    "send", deliver_message;
    "list", (fun () -> ignore (Sys.command (pager ^ " " ^ file)));
    "abort", (fun () -> print_endline "See ya!"; exit 0);
  ]

let errors = ref 0

let () =
  try
    while true do
      print_string "Action: ";
      flush stdout;
      let answer = input_line stdin in
      (* trim leading white space *)
      let answer = Str.replace_first (Str.regexp "^[ \t]+") "" answer in
      (* trim trailing white space *)
      let answer = Str.replace_first (Str.regexp "[ \t]+$") "" answer in
      let regexp = Str.regexp_string_case_fold answer in
      let found = ref false in
      List.iter
        (fun (action, handler) ->
           if Str.string_match regexp action 0
           then (found := true; handler ()))
        actions;
      if not !found
      then (incr errors; print_endline ("Unknown command: " ^ answer))
    done
  with End_of_file -> ()

Program: urlify

#!/usr/bin/ocaml
(* urlify - wrap HTML links around URL-like constructs *)
#directory "+pcre";;
#load "pcre.cma";;

let urls = "(http|telnet|gopher|file|wais|ftp)"
let ltrs = "\\w"
let gunk = "/#~:.?+=&%@!\\-"
let punc = ".:?\\-"
let any  = ltrs ^ gunk ^ punc

let rex = Pcre.regexp ~flags:[`CASELESS; `EXTENDED]
  (Printf.sprintf "
      \\b                   # start at word boundary
      (                     # begin $1  {
       %s        :          # need resource and a colon
       [%s] +?              # followed by one or more
                            #  of any valid character, but
                            #  be conservative and take only
                            #  what you need to....
      )                     # end   $1  }
      (?=                   # look-ahead non-consumptive assertion
       [%s]*                # either 0 or more punctuation
       [^%s]                #   followed by a non-url char
       |                    # or else
       $                    #   then end of the string
      )
" urls any punc any)

let templ = "<A HREF=\"$1\">$1</A>"

let () =
  try
    while true do
      let line = input_line stdin in
      print_endline (Pcre.replace ~rex ~templ line)
    done
  with End_of_file ->
    ()

Program: tcgrep

% tcgrep -ril '^From: .*kate' ~/mail

(*-----------------------------*)

#!/usr/bin/ocaml
(* tcgrep: rewrite of tom christiansen's rewrite of grep *)
#load "unix.cma";;

(* http://ocaml.info/home/ocaml_sources.html#pcre-ocaml *)
#directory "+pcre";;
#load "pcre.cma";;

(* http://alain.frisch.fr/soft.html#Getopt *)
#directory "+getopt";;
#load "getopt.cma";;

(* Initialize globals. *)
let me = Pcre.replace ~pat:".*/" Sys.argv.(0)
let matches = ref 0
let errors = ref 0
let grand_total = ref 0
let mult = ref false
let compress = [".z", "zcat"; ".gz", "zcat"; ".Z", "zcat"]

(* Prints usage and exits. *)
let usage () =
  Printf.eprintf "usage: %s [flags] [files]

Standard grep options:
        i   case insensitive
        n   number lines
        c   give count of lines matching
        C   ditto, but >1 match per line possible
        w   word boundaries only
        s   silent mode
        x   exact matches only
        v   invert search sense (lines that DON'T match)
        h   hide filenames
        e   expression (for exprs beginning with -)
        f   file with expressions
        l   list filenames matching

Specials:
        1   1 match per file
        H   highlight matches
        u   underline matches
        r   recursive on directories or dot if none
        t   process directories in 'ls -t' order
        p   paragraph mode (default: line mode)
        P   ditto, but specify separator, e.g. -P '%%'
        a   all files, not just plain text files (not implemented)
        q   quiet about failed file and dir opens
        T   trace files as opened

May use a TCGREP environment variable to set default options.
" me;
  exit 255

(* Produces a stream of lines from an input channel. *)
let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

(* Produces a stream of chunks from an input channel given a delimiter. *)
let delimited_stream_of_channel delim channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some delim', [] when delim' = delim ->
          Stream.junk lines; next para_lines i
      | Some delim', _ when delim' = delim ->
          Some (String.concat "\n" (List.rev para_lines))
      | None, _ ->
          Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

(* An empty delimiter corresponds to an empty line, so we can create
   a paragraph stream in terms of the previous function. *)
let paragraph_stream_of_channel = delimited_stream_of_channel ""

(* By default, the stream builder will produce lines. This can be changed
   by the -p and -P options. *)
let stream_of_channel = ref line_stream_of_channel

(* Type for command-line options and their values. *)
type opt = OBool of bool | OStr of string

(* Set an option. *)
let opt_set opt c =
  Hashtbl.replace opt c (OBool true)

(* Test an option. *)
let opt_test opt c =
  try
    match Hashtbl.find opt c with
      | OBool b -> b
      | OStr "" -> false
      | OStr _ -> true
  with Not_found ->
    false

(* Convert an option to a string. *)
let opt_str opt c =
  try
    match Hashtbl.find opt c with
      | OBool b -> string_of_bool b
      | OStr s -> s
  with Not_found ->
    ""

(* Gets terminal escape characters. *)
let tput cap =
  let ch = Unix.open_process_in ("tput " ^ cap) in
  try
    let result = input_line ch in
    ignore (Unix.close_process_in ch);
    result
  with
    | End_of_file ->
        ignore (Unix.close_process_in ch);
        ""
    | e ->
        ignore (Unix.close_process_in ch);
        raise e

(* Splits a filename into its base and extension. *)
let splitext name =
  try
    let base = Filename.chop_extension name in
    let i = String.length base in
    let ext = String.sub name i (String.length name - i) in
    base, ext
  with Invalid_argument _ ->
    name, ""

(* Parses command-line arguments. *)
let parse_args () =
  let opt = Hashtbl.create 0 in
  let args = ref [] in

  let optstring = "incCwsxvhe:f:l1HurtpP:aqT" in
  let optstream = Stream.of_string optstring in

  (* Prepare options for Getopt. *)
  let opts =
    let str_setter c =
      (c, Getopt.nolong,
       None,
       Some (fun s -> Hashtbl.replace opt c (OStr s))) in
    let int_setter c =
      (c, Getopt.nolong,
       Some (fun () -> Hashtbl.replace opt c (OBool true)),
       None) in
    let rec loop acc =
      match Stream.peek optstream with
        | Some c ->
            (Stream.junk optstream;
             match Stream.peek optstream with
               | Some ':' ->
                   Stream.junk optstream;
                   loop (str_setter c :: acc)
               | _ ->
                   loop (int_setter c :: acc))
        | None -> List.rev acc in
    loop [] in

  (* Handle TCGREP environment variable. *)
  let cmdline = ref (List.tl (Array.to_list Sys.argv)) in
  Array.iter
    (fun env ->
       if (String.length env > 7
           && String.sub env 0 7 = "TCGREP=")
       then
         begin
           let s = String.sub env 7 (String.length env - 7) in
           let s = if s.[0] <> '-' then "-" ^ s else s in
           cmdline := s :: !cmdline
         end)
    (Unix.environment ());
  let cmdline = Array.of_list !cmdline in

  (* Parse command-line options using Getopt. *)
  begin
    try
      Getopt.parse
        opts (fun arg -> args := arg :: !args)
        cmdline 0 (Array.length cmdline - 1);
      args := List.rev !args
    with Getopt.Error e ->
      prerr_endline e;
      usage ()
  end;

  (* Read patterns from file or command line. *)
  let patterns =
    if opt_test opt 'f'
    then
      begin
        let in_channel =
          try open_in (opt_str opt 'f')
          with e ->
            Printf.eprintf "%s: can't open %s: %s\n"
              me (opt_str opt 'f') (Printexc.to_string e);
            exit 255 in
        try
          let acc = ref [] in
          Stream.iter
            (fun pat -> acc := pat :: !acc)
            (line_stream_of_channel in_channel);
          close_in in_channel;
          List.rev !acc
        with e ->
          close_in in_channel;
          Printf.eprintf "%s: error reading %s: %s\n"
            me (opt_str opt 'f') (Printexc.to_string e);
          exit 255
      end
    else if opt_test opt 'e'
    then [opt_str opt 'e']
    else [match !args with h :: t -> (args := t; h) | [] -> usage ()] in

  (* Terminal escape characters for highlighting options. *)
  let highlight =
    if opt_test opt 'H'
    then tput "smso" ^ "$1" ^ tput "rmso"
    else if opt_test opt 'u'
    then tput "smul" ^ "$1" ^ tput "rmul"
    else "$1" in

  (* Regular expression flags to use. *)
  let flags = ref [] in
  if opt_test opt 'i' then flags := `CASELESS :: !flags;

  (* Options for paragraph modes. *)
  if opt_test opt 'p'
  then stream_of_channel := paragraph_stream_of_channel;
  if opt_test opt 'P'
  then stream_of_channel := delimited_stream_of_channel (opt_str opt 'P');

  (* Word boundary option. *)
  let patterns =
    if opt_test opt 'w'
    then List.map (fun pat -> "\\b" ^ pat ^ "\\b") patterns
    else patterns in

  (* Exact match option. *)
  let patterns =
    if opt_test opt 'x'
    then List.map (fun pat -> "^" ^ pat ^ "$") patterns
    else patterns in

  (* Options that imply other options. *)
  if opt_test opt 'l' then opt_set opt '1';
  if opt_test opt 'u' then opt_set opt 'H';
  if opt_test opt 'C' then opt_set opt 'c';
  if opt_test opt 'c' then opt_set opt 's';
  if opt_test opt 's' && not (opt_test opt 'c') then opt_set opt '1';

  (* Compile the regular expression patterns. *)
  let rexes =
    List.map
      (fun pat ->
         try Pcre.regexp ~flags:!flags ("(" ^ pat ^ ")")
         with Pcre.BadPattern (msg, _) ->
           Printf.eprintf "%s: bad pattern %s: %s\n" me pat msg;
           exit 255)
      patterns in

  (* Increments the matches variable by the number of matches
     (or non-matches) in the given line. *)
  let count_matches line =
    if opt_test opt 'v'
    then fun rex ->
      (if not (Pcre.pmatch ~rex line) then incr matches)
    else if opt_test opt 'C'
    then fun rex ->
      (matches := !matches + (try Array.length (Pcre.extract_all ~rex line)
                              with Not_found -> 0))
    else fun rex ->
      (if Pcre.pmatch ~rex line then incr matches) in

  (* Counts matches in a line and returns the line with any
     necessary highlighting. *)
  let matcher line =
    List.iter (count_matches line) rexes;
    if opt_test opt 'H'
    then
      List.fold_left
        (fun line rex ->
           Pcre.replace ~rex ~templ:highlight line)
        line rexes
    else line in

  (* List of files or directories to process. *)
  let files =
    match !args with
      | [] -> if opt_test opt 'r' then ["."] else ["-"]
      | [arg] -> [arg]
      | args -> (mult := true; args) in

  (* Overrides for options that affect the multiple-file flag. *)
  if opt_test opt 'h' then mult := false;
  if opt_test opt 'r' then mult := true;

  (* Return the three values to be processed by matchfiles. *)
  opt, matcher, files

(* Used to break out of loops and abort processing of the current file. *)
exception NextFile

(* Runs given matcher on a list of files using the specified options. *)
let rec matchfiles opt matcher files =
  (* Handles a single directory. *)
  let matchdir dir =
    if not (opt_test opt 'r')
    then
      begin
        if opt_test opt 'T'
        then (Printf.eprintf "%s: \"%s\" is a directory, but no -r given\n"
                me dir;
              flush stderr)
      end
    else
      begin
        let files =
          try Some (Sys.readdir dir)
          with e ->
            if not (opt_test opt 'q')
            then (Printf.eprintf "%s: can't readdir %s: %s\n"
                    me dir (Printexc.to_string e);
                  flush stderr);
            incr errors;
            None in
        match files with
          | Some files ->
              let by_mtime a b =
                compare
                  (Unix.stat (Filename.concat dir b)).Unix.st_mtime
                  (Unix.stat (Filename.concat dir a)).Unix.st_mtime in
              if opt_test opt 't'
              then Array.sort by_mtime files;
              matchfiles opt matcher
                (Array.to_list
                   (Array.map
                      (Filename.concat dir)
                      files))
          | None -> ()
      end in

  (* Handles a single file. *)
  let matchfile file =
    (* Keep a running total of matches for this file. *)
    let total = ref 0 in

    (* Keep track of the current line number. *)
    let line_num = ref 0 in

    (* Shadow close_in to properly close process channels for compressed
       files and avoid closing stdin. *)
    let process_open = ref false in
    let close_in channel =
      if !process_open
      then ignore (Unix.close_process_in channel)
      else if channel != stdin then close_in channel in

    (* Process a line (or paragraph, with -p or -P) of input. *)
    let matchline line =
       incr line_num;
       matches := 0;
       let line = matcher line in
       if !matches > 0
       then
         begin
           total := !total + !matches;
           grand_total := !grand_total + !matches;
           if opt_test opt 'l'
           then (print_endline file; raise NextFile)
           else if not (opt_test opt 's')
           then (Printf.printf "%s%s%s%s\n"
                   (if !mult
                    then file ^ ":"
                    else "")
                   (if opt_test opt 'n'
                    then (string_of_int !line_num
                          ^ (if opt_test opt 'p' || opt_test opt 'P'
                             then ":\n"
                             else ":"))
                    else "")
                   line
                   (if opt_test opt 'p' || opt_test opt 'P'
                    then "\n" ^ String.make 20 '-'
                    else "");
                 flush stdout);
           if opt_test opt '1' then raise NextFile
         end in

    (* Get a channel for the file, starting a decompression process if
       necessary. If None, the file will be skipped. *)
    let maybe_channel =
      if file = "-"
      then (if Unix.isatty Unix.stdin && not (opt_test opt 'q')
            then (Printf.eprintf "%s: reading from stdin\n" me;
                  flush stderr);
            Some stdin)
      else if not (Sys.file_exists file)
      then (if not (opt_test opt 'q')
            then (Printf.eprintf "%s: file \"%s\" does not exist\n"
                    me file;
                  flush stderr);
            incr errors;
            None)
      else if List.mem_assoc (snd (splitext file)) compress
      then (process_open := true;
            try Some (Unix.open_process_in
                        (List.assoc (snd (splitext file)) compress
                         ^ " < " ^ file))
            with e ->
              if not (opt_test opt 'q')
              then (Printf.eprintf "%s: %s: %s\n" me file
                      (Printexc.to_string e);
                    flush stderr);
              incr errors;
              None)
      else (try Some (open_in file)
            with e ->
              if not (opt_test opt 'q')
              then (Printf.eprintf "%s: %s: %s\n" me file
                      (Printexc.to_string e);
                    flush stderr);
              incr errors;
              None) in

    (* Run matcher on the open channel, then close the channel. *)
    match maybe_channel with
      | None -> ()
      | Some channel ->
          begin
            try
              if opt_test opt 'T'
              then (Printf.eprintf "%s: checking %s\n" me file;
                    flush stderr);
              Stream.iter matchline (!stream_of_channel channel);
              close_in channel
            with
              | NextFile ->
                  close_in channel
              | e ->
                  close_in channel;
                  raise e
          end;
          if opt_test opt 'c'
          then (Printf.printf "%s%d\n"
                  (if !mult then file ^ ":" else "")
                  !total;
                flush stdout) in

  (* Handle each of the specified files and directories. *)
  List.iter
    (fun file ->
       if file = "-"
       then matchfile file
       else if try Sys.is_directory file with _ -> false
       then matchdir file
       else matchfile file)
    files

(* Parse command line arguments, run matcher, and set result status. *)
let opt, matcher, files = parse_args ()
let () =
  matchfiles opt matcher files;
  if !errors > 0 then exit 2;
  if !grand_total > 0 then exit 0;
  exit 1

Regular Expression Grabbag

#directory "+pcre";;
#load "pcre.cma";;

(*-----------------------------*)

Pcre.pmatch
  ~rex:(Pcre.regexp
          ~flags:[`CASELESS]
          "^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$")
  input

(*-----------------------------*)

Pcre.replace
  ~pat:"(\\S+)(\\s+)(\\S+)"
  ~templ:"$3$2$1"
  input

(*-----------------------------*)

Pcre.extract
  ~full_match:false
  ~pat:"(\\w+)\\s*=\\s*(.*)\\s*$"
  input

(*-----------------------------*)

Pcre.pmatch
  ~pat:".{80,}"
  input

(*-----------------------------*)

Pcre.extract
  ~full_match:false
  ~pat:"(\\d+)/(\\d+)/(\\d+) (\\d+):(\\d+):(\\d+)"
  input

(*-----------------------------*)

Pcre.replace
  ~pat:"/usr/bin"
  ~templ:"/usr/local/bin"
  input

(*-----------------------------*)

Pcre.substitute_substrings
  ~pat:"%([0-9A-Fa-f][0-9A-Fa-f])"
  ~subst:(fun subs ->
            let c = Pcre.get_substring subs 1 in
            String.make 1 (Char.chr (int_of_string ("0x" ^ c))))
  input

(*-----------------------------*)

Pcre.replace
  ~rex:(Pcre.regexp
          ~flags:[`DOTALL; `EXTENDED] "
            /\\*  # Match the opening delimiter
            .*?   # Match a minimal number of characters
            \\*/  # Match the closing delimiter
          ")
  input

(*-----------------------------*)

Pcre.replace ~pat:"^\\s+" input
Pcre.replace ~pat:"\\s+$" input

(*-----------------------------*)

Pcre.replace ~pat:"\\\\n" ~templ:"\n" input

(*-----------------------------*)

Pcre.replace ~pat:"^.*::" input

(*-----------------------------*)

Pcre.extract
  ~full_match:false
  ~pat:"^([01]?\\d\\d|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d|2[0-4]\\d|25[0-5])\\.\
         ([01]?\\d\\d|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d|2[0-4]\\d|25[0-5])$"
  input

(*-----------------------------*)

Pcre.replace ~pat:"^.*/" input

(*-----------------------------*)

let termcap = ":co#80:li#24:"
let cols =
  try int_of_string (Pcre.extract ~pat:":co#(\\d+):" termcap).(1)
  with Not_found | Failure "int_of_string" -> 80

(*-----------------------------*)

let name =
  Pcre.replace
    ~pat:" /\\S+/"
    ~templ:" "
    (" " ^ String.concat " " (Array.to_list Sys.argv))

(*-----------------------------*)

#load "unix.cma";;
let () =
  let ch = Unix.open_process_in "uname -a" in
  let os = input_line ch in
  ignore (Unix.close_process_in ch);
  if not (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`CASELESS] "linux") os)
  then print_endline "This isn't Linux"

(*-----------------------------*)

Pcre.replace ~pat:"\n\\s+" ~templ:" " input

(*-----------------------------*)

let nums =
  Array.map
    (fun group -> float_of_string group.(1))
    (Pcre.extract_all
       ~pat:"(\\d+\\.?\\d*|\\.\\d+)"
       input)

(*-----------------------------*)

let capwords =
  Array.map
    (fun group -> group.(1))
    (Pcre.extract_all
       ~pat:"(\\b[^\\Wa-z0-9_]+\\b)"
       input)

(*-----------------------------*)

let lowords =
  Array.map
    (fun group -> group.(1))
    (Pcre.extract_all
       ~pat:"(\\b[^\\WA-Z0-9_]+\\b)"
       input)

(*-----------------------------*)

let icwords =
  Array.map
    (fun group -> group.(1))
    (Pcre.extract_all
       ~pat:"(\\b[^\\Wa-z0-9_][^\\WA-Z0-9_]*\\b)"
       input)

(*-----------------------------*)

let links =
  Array.map
    (fun group -> group.(1))
    (Pcre.extract_all
       ~rex:(Pcre.regexp
               ~flags:[`DOTALL; `CASELESS]
               "<A[^>]+?HREF\\s*=\\s*[\"']?([^'\" >]+?)[ '\"]?>")
       input)

(*-----------------------------*)

let initial =
  try (Pcre.extract ~pat:"^\\S+\\s+(\\S)\\S*\\s+\\S" input).(1)
  with Not_found -> ""

(*-----------------------------*)

Pcre.replace ~pat:"\"([^\"]*)\"" ~templ:"``$1''" input

(*-----------------------------*)

let sentences =
  Array.map
    (fun group -> group.(1))
    (Pcre.extract_all
       ~pat:"(\\S.*?\\pP)(?=  |\\Z)"
       (Pcre.replace ~pat:" {3,}" ~templ:"  "
          (Pcre.replace ~pat:"\n" ~templ:" "
             (Pcre.replace ~pat:"(\\pP\n)" ~templ:"$1  "
                input))))

(*-----------------------------*)

Pcre.extract ~full_match:false ~pat:"(\\d{4})-(\\d\\d)-(\\d\\d)" input

(*-----------------------------*)

Pcre.pmatch
  ~pat:"^[01]?[- .]?(\\([2-9]\\d{2}\\)|[2-9]\\d{2})[- .]?\\d{3}[- .]?\\d{4}$"
  input

(*-----------------------------*)

Pcre.pmatch
  ~rex:(Pcre.regexp
          ~flags:[`CASELESS]
          "\\boh\\s+my\\s+gh?o(d(dess(es)?|s?)|odness|sh)\\b")
  input

(*-----------------------------*)

let lines =
  Array.map
    (fun group -> group.(1))
    (Pcre.extract_all
       ~pat:"([^\010\013]*)(\010\013?|\013\010?)"
       input)

Prev	Home	Next
Hashes		File Access