6. Pattern Matching

Introduction

(* We will use the Str library distributed with OCaml for regular expressions.
 * There are two ways to use the str library, building a top or passing it to ocaml.
 * Under Unix, you can create a new toplevel which has the Str module:
 *      $ ocamlmktop -o strtop str.cma 
 *      $ ./strtop
 *   Now you don't need to prefix the contents of the str module with Str.
 * The alternative is to pass str.cma as a parameter:
 *     $ ocaml str.cma
 *   Now you may refer to the contents of the str module by using Str. 
 * Under Windows, if you are using ocamlwin.exe you can simply load Str:
 *      # load "str.cma";;
 *)
(* Str.search_forward returns an int or throws an exception if the pattern isn't found.
 * In Perl, the =~ operator returns null. Since these two values have different
 * types in OCaml, we cannot copy this behaviour directly.
 * Instead, we return an impossible index, -1 using try ... with.
 * Another method would be to define an =~ operator and use that directly:
# let (=~) s re = Str.string_match (Str.regexp re) s 0;; 
val ( =~ ) : string -> string -> bool = <fun>
# "abc" =~ "a";;
- : bool = true
# "bc" =~ "a";;
- : bool = false
 * Don't underestimate the power of this. Many of the following examples could be 
 * simplified by defining infix operators.
 *)
try Str.search_forward (Str.regexp pattern) string 0;
with Not_found -> -1;;

try Str.replace_first (Str.regexp pattern) replacement string;
with Not_found -> "";;
(* ---------------------------- *)
try (Str.search_forward (Str.regexp "sheep") meadow 0) > -1;
with Not_found -> false;; (* true if meadow contains "sheep" *)

try not ((Str.search_forward (Str.regexp "sheep") meadow 0) > -1);
with Not_found -> true;; (* true if meadow doesn't contain "sheep" *)

let meadow = 
    try Str.replace_first (Str.regexp "old") "new" meadow;
    with Not_found -> meadow;; (* Replace "old" with "new" in meadow *)
(* ---------------------------- *)
try 
    let temp = Str.search_forward (Str.regexp "\\bovines?\\b") meadow 0 in
        print_string "Here be sheep!";
with Not_found -> ();;
(* ---------------------------- *)
let string = "good food" in
    try
        Str.replace_first (Str.regexp "o*") "e" string;
    with Not_found -> string;;
(* ---------------------------- *)
(* There is no way to take command line parameters to ocaml that I know of. 
 * You would first have to compile your OCaml program using ocamlc.
 *)
(* ---------------------------- *)
let rec match_num s start=
    if String.length s > 0 then
        try 
            let temp = Str.search_forward (Str.regexp "[0123456789]+") s start in
                print_string (String.concat "" ("Found number " :: Str.matched_string s :: ["\n"]));
                match_num s (temp + 1);
        with Not_found -> ();
    else
        ();;
(* ---------------------------- *)
let rec match_group s start numbers=
    if String.length s > 0 then
        try 
            let temp = (Str.search_forward (Str.regexp "[0123456789]+") s start) in
                let numbers = Str.matched_string s :: numbers in 
                    match_group s (temp + 1) numbers;
        with Not_found -> numbers;
    else
        numbers;;
(* ---------------------------- *)
let (=+) s re = 
    let result = ref [] in
    let offset = ref 0 in
    while ((String.length s) > !offset) do
        try
            offset := 1 + (Str.search_forward (Str.regexp re) s !offset);
            result := !result @ [Str.matched_string s] @ [];
        with Not_found -> ignore (offset := String.length s)
    done;
    result;;
    
let (=-) s re = 
    let result = ref [] in
    let offset = ref 0 in
    while ((String.length s) > !offset) do
        try
            ignore (Str.search_forward (Str.regexp re) s !offset);
            offset := Str.match_end ();
            result := !result @ [Str.matched_string s] @ [];
        with Not_found -> ignore (offset := String.length s)
    done;
    result;;

let digits = "123456789";;
let yeslap = digits =+ "[1234567890][1234567890][1234567890]";;
let nonlap = digits =- "[1234567890][1234567890][1234567890]";;

print_string "Non-overlapping: ";
List.iter (fun v -> print_string (v ^ " ")) !nonlap;
print_string "\n";;
(* Non-overlapping: 123 456 789 *)

print_string "Overlapping: ";
List.iter (fun v -> print_string (v ^ " ")) !yeslap;
print_string "\n";;
(* Overlapping: 123 234 345 456 567 678 789 *)
(* ---------------------------- *)
let index = ref 0;;
let string = "And little lambs eat ivy";;
try
    index := Str.search_forward (Str.regexp "l[^s]*s") string 0;
with Not_found -> ();;
    
print_string ("(" ^ (String.sub string 0 !index) ^ ") ");
print_string ("(" ^ (Str.matched_string string) ^ ") ");
print_string ("(" ^ (Str.string_after string 16) ^ ")\n");;
(* (And ) (little lambs) ( eat ivy) *)

Copying and Substituting Simultaneously

#load "str.cma";;

(* The Str module doesn't modify strings in place; you always get
   a copy when you perform a substitution. *)
let dst = Str.global_replace (Str.regexp "this") "that" src

(* Strip to basename. *)
let progname = Str.replace_first (Str.regexp "^.*/") "" Sys.argv.(0)

(* Make All Words Title-Cased. *)
let capword =
  Str.global_substitute
    (Str.regexp "\\b.")
    (fun s -> String.uppercase (Str.matched_string s))
    words

(* /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1 *)
let catpage =
  Str.replace_first (Str.regexp "man\\([0-9]\\)") "cat\\1" manpage

(* Copy and substitute on all strings in a list. *)
let bindirs = ["/usr/bin"; "/bin"; "/usr/local/bin"]
let libdirs =
  List.map (fun s -> Str.replace_first (Str.regexp "bin") "lib" s)
    bindirs
(* ["/usr/lib"; "/lib"; "/usr/local/lib"] *)

Matching Letters

(* Str can do a simple character range match, but it isn't very
   practical for matching alphabetic characters in general. *)
#load "str.cma";;
let () =
  if Str.string_match (Str.regexp "^[A-Za-z]+$") var 0
  then print_endline "var is purely alphabetic"

(* With Pcre, you can use UTF8 support and match characters with
   the letter property. *)
#directory "+pcre";;
#load "pcre.cma";;
let () =
  if Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`UTF8] "^\\pL+$") var
  then print_endline "var is purely alphabetic"

Matching Words

#load "str.cma";;

(* Str's regexps lack a whitespace-matching pattern.
   Here is a substitute. *)
let whitespace_chars =
  String.concat ""
    (List.map (String.make 1)
       [
         Char.chr 9;  (* HT *)
         Char.chr 10; (* LF *)
         Char.chr 11; (* VT *)
         Char.chr 12; (* FF *)
         Char.chr 13; (* CR *)
         Char.chr 32; (* space *)
       ])
let space = "[" ^ whitespace_chars ^ "]"
let non_space = "[^" ^ whitespace_chars ^ "]"

(* as many non-whitespace characters as possible *)
let regexp = Str.regexp (non_space ^ "+")

(* as many letters, apostrophes, and hyphens *)
let regexp = Str.regexp "[A-Za-z'-]+"

(* usually best *)
let regexp = Str.regexp "\\b\\([A-Za-z]+\\)\\b"

(* fails at ends or w/ punctuation *)
let regexp = Str.regexp (space ^ "\\([A-Za-z]+\\)" ^ space)

Commenting Regular Expressions

#!/usr/bin/ocaml
(* resname - change all "foo.bar.com" style names in the input stream
   into "foo.bar.com [204.148.40.9]" (or whatever) instead *)

#directory "+pcre";;
#load "pcre.cma";;
#load "unix.cma";;

let regexp =
  Pcre.regexp ~flags:[`EXTENDED] "
    (                     # capture the hostname in substring 1
      (?:                 # these parens for grouping only
        (?! [-_]  )       # lookahead for neither underscore nor dash
        [\\w-] +          # hostname component
        \\.               # and the domain dot
      ) +                 # now repeat that whole thing a bunch of times
      [A-Za-z]            # next must be a letter
      [\\w-] +            # now trailing domain part
    )                     # end of substring 1 capture
  "

let process line =
  print_endline
    (Pcre.substitute_substrings
       ~rex:regexp
       ~subst:(fun subs ->
                 let name = Pcre.get_substring subs 1 in
                 let addr =
                   try
                     Unix.string_of_inet_addr
                       (Unix.gethostbyname name).Unix.h_addr_list.(0)
                   with Not_found -> "???" in
                 name ^ " [" ^ addr ^ "]")
       line)

let () =
  try
    while true do
      let line = read_line () in
      process line
    done
  with End_of_file -> ()

(*-----------------------------*)

let vars = Hashtbl.create 0
let () =
  Hashtbl.replace vars "name" "Bob";
  Hashtbl.replace vars "flavor" "rhubarb"

let () =
  print_endline
    (Pcre.substitute_substrings
       ~rex:(Pcre.regexp ~flags:[`EXTENDED] "
               \\#                #   a pound sign
               (\\w+)             #   the variable name
               \\#                #   another pound sign
             ")
       ~subst:(fun subs -> Hashtbl.find vars (Pcre.get_substring subs 1))
       "Hello, #name#, would you like some #flavor# pie?")

Finding the Nth Occurrence of a Match

#load "str.cma";;

let want = 3
let count = ref 0
let pond = "One fish two fish red fish blue fish"
let regexp = Str.regexp_case_fold "\\([a-z]+\\)[ ]+fish\\b"

exception Found of string
let () =
  let start = ref 0 in
  try
    while true do
      ignore (Str.search_forward regexp pond !start);
      start := !start + String.length (Str.matched_string pond);
      incr count;
      if !count = want then raise (Found (Str.matched_group 1 pond))
    done
  with
    | Found color ->
        Printf.printf "The third fish is a %s one.\n" color
    | Not_found ->
        Printf.printf "Only found %d fish!\n" !count

(* The third fish is a red one. *)

(*-----------------------------*)

let colors =
  let start = ref 0 in
  let fish = ref [] in
  begin
    try
      while true do
        ignore (Str.search_forward regexp pond !start);
        start := !start + (String.length (Str.matched_string pond));
        fish := (Str.matched_group 1 pond) :: !fish
      done;
    with Not_found -> ()
  end;
  Array.of_list (List.rev !fish)

let () =
  Printf.printf "The third fish in the pond is %s.\n" colors.(2)

(* The third fish in the pond is red. *)

(*-----------------------------*)

let evens =
  let colors' = ref [] in
  Array.iteri
    (fun i color -> if i mod 2 = 1 then colors' := color :: !colors')
    colors;
  List.rev !colors'
let () =
  Printf.printf "Even numbered fish are %s.\n" (String.concat " " evens)

(* Even numbered fish are two blue. *)

(*-----------------------------*)

let () =
  let count = ref 0 in
  print_endline
    (Str.global_substitute
       (Str.regexp_case_fold "\\b\\([a-z]+\\)\\([ ]+fish\\b\\)")
       (fun s ->
          incr count;
          if !count = 4
          then "sushi" ^ Str.matched_group 2 s
          else Str.matched_group 1 s ^ Str.matched_group 2 s)
       pond)

(* One fish two fish red fish sushi fish *)

(*-----------------------------*)

let pond = "One fish two fish red fish blue fish swim here."
let regexp = Str.regexp_case_fold "\\b\\([a-z]+\\)[ ]+fish\\b"
let colors =
  let rec loop start acc =
    try
      ignore (Str.search_forward regexp pond start);
      loop
        (start + String.length (Str.matched_string pond))
        (Str.matched_group 1 pond :: acc)
    with Not_found ->
      acc in
  loop 0 []
let color = List.hd colors
let () = Printf.printf "Last fish is %s.\n" color

(* Last fish is blue. *)

Matching Multiple Lines

#!/usr/bin/ocaml
(* killtags - very bad html tag killer *)
#load "str.cma";;
let regexp = Str.regexp "<[^>]*>"
let () =
  List.iter
    (fun filename ->
       let lines = ref [] in
       let in_channel = open_in filename in
       try
         begin
           try while true do lines := input_line in_channel :: !lines done
           with End_of_file -> ()
         end;
         let contents = String.concat "\n" (List.rev !lines) in
         print_endline
           (String.concat ""
              (List.map
                 (function
                    | Str.Text s -> s
                    | _ -> "")
                 (Str.full_split regexp contents)));
         close_in in_channel
       with e ->
         close_in in_channel;
         raise e)
    (List.tl (Array.to_list Sys.argv))

(*-----------------------------*)

#!/usr/bin/ocaml
(* headerfy - change certain chapter headers to html *)
#load "str.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let regexp = Str.regexp "^Chapter[\r\n\t ]+[0-9]+[\r\n\t ]*:[^\r\n]*"

let headerfy chunk =
  String.concat ""
    (List.map
       (function
          | Str.Text s -> s
          | Str.Delim s -> "<H1>" ^ s ^ "</H1>")
       (Str.full_split regexp chunk))

let () =
  List.iter
    (fun filename ->
       let in_channel = open_in filename in
       try
         Stream.iter
           (fun para ->
              print_endline (headerfy para);
              print_newline ())
           (paragraph_stream_of_channel in_channel);
         close_in in_channel
       with e ->
         close_in in_channel;
         raise e)
    (List.tl (Array.to_list Sys.argv))

Reading Records with a Pattern Separator

#load "str.cma";;
let chunks =
  let lines = ref [] in
  begin
    try while true do lines := input_line stdin :: !lines done
    with End_of_file -> ()
  end;
  let contents = String.concat "\n" (List.rev !lines) in
  Str.full_split (Str.regexp "^\\.\\(Ch\\|Se\\|Ss\\)$") contents
let () =
  Printf.printf
    "I read %d chunks.\n"
    (List.length chunks)

Extracting a Range of Lines

#load "str.cma";;

(* Creates a stream that produces ranges of items from another stream.
   Production of items starts when when (start_test count item) returns
   true and stops when (finish_test count item) returns true. Multiple
   ranges will be produced if start_test returns true again. The count
   starts at 1. Ranges are inclusive; the item that causes finish_test
   to return true will be produced. *)
let stream_range start_test finish_test stream =
  let active = ref false in
  let count = ref 1 in
  let rec next i =
    match Stream.peek stream with
      | None -> None
      | Some item ->
          if not !active then
            begin
              if start_test !count item
              then (active := true; next i)
              else (Stream.junk stream; incr count; next i)
            end
          else
            begin
              if finish_test !count item then active := false;
              Stream.junk stream;
              incr count;
              Some item
            end in
  Stream.from next

(* Creates a stream that produces items between a pair of indices.
   If start = 2 and finish = 4, items 2, 3, and 4 will be produced.
   The first item is number 1. *)
let stream_range_numbers start finish stream =
  stream_range
    (fun count _ -> count = start)
    (fun count _ -> count = finish)
    stream

(* Creates a stream that produces strings between a pair of regexps.
   The regexp will be tested using Str.string_match. *)
let stream_range_patterns start finish stream =
  stream_range
    (fun _ line -> Str.string_match start line 0)
    (fun _ line -> Str.string_match finish line 0)
    stream

(* Produce a stream of lines from an input channel. *)
let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

(* Print lines 15 through 17 inclusive. *)
let () =
  Stream.iter
    print_endline
    (stream_range_numbers 15 17
       (line_stream_of_channel (open_in datafile)))

(* Print out all <XMP> .. </XMP> displays from HTML doc. *)
let () =
  Stream.iter
    print_endline
    (stream_range_patterns
       (Str.regexp ".*<XMP>")
       (Str.regexp ".*</XMP>")
       (line_stream_of_channel stdin))

(*-----------------------------*)

let in_header = ref true
let in_body = ref false
let () =
  Stream.iter
    (fun line ->
       if !in_header && line = ""
       then (in_header := false; in_body := true)
       else
         begin
           (* do something with line *)
         end)
    (line_stream_of_channel stdin)

(*-----------------------------*)

module StringSet = Set.Make(String)
let seen = ref StringSet.empty
let email_regexp = Str.regexp "\\([^<>(),; \t]+@[^<>(),; \t]+\\)"
let () =
  Stream.iter
    (fun line ->
       List.iter
         (function
            | Str.Delim email ->
                if not (StringSet.mem email !seen)
                then
                  begin
                    seen := StringSet.add email !seen;
                    print_endline email;
                  end
            | _ -> ())
         (Str.full_split email_regexp line))
    (stream_range_patterns
       (Str.regexp "^From:?[ \t]")
       (Str.regexp "^$")
       (line_stream_of_channel stdin))

Matching Shell Globs as Regular Expressions

let regexp_string_of_glob s =
  let i, buffer = ref (-1), Buffer.create (String.length s + 8) in
  let read () =
    try incr i; Some s.[!i]
    with Invalid_argument _ -> None in
  let write = Buffer.add_string buffer in
  let rec parse_glob () =
    match read () with
      | Some '*' -> write ".*"; parse_glob ()
      | Some '?' -> write "."; parse_glob ()
      | Some '[' -> parse_bracket ""
      | Some c -> write (Str.quote (String.make 1 c)); parse_glob ()
      | None -> ()
  and parse_bracket text =
    match read () with
      | Some '!' when text = "" -> parse_bracket "^"
      | Some ']' -> write ("[" ^ text ^ "]"); parse_glob ()
      | Some c -> parse_bracket (text ^ (String.make 1 c))
      | None -> write (Str.quote ("[" ^ text)) in
  write "^";
  parse_glob ();
  write "$";
  Buffer.contents buffer

let regexp_of_glob s = Str.regexp (regexp_string_of_glob s)

Speeding Up Interpolated Matches

#load "str.cma";;

let popstates = ["CO"; "ON"; "MI"; "WI"; "MN"]

(* Naive version: Compile a regexp each time it is needed. *)
let popgrep1 () =
  try
    begin
      while true do
        let line = input_line stdin in
        try
          List.iter
            (fun state ->
               if (Str.string_match
                     (Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b"))
                     line 0)
               then (print_endline line; raise Exit))
            popstates
        with Exit -> ()
      done
    end
  with End_of_file -> ()

(* First optimization: Compile the regexps in advance. *)
let popgrep2 () =
  let popstate_regexps =
    List.map
      (fun state ->
         Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b"))
      popstates in
  try
    begin
      while true do
        let line = input_line stdin in
        try
          List.iter
            (fun regexp ->
               if (Str.string_match regexp line 0)
               then (print_endline line; raise Exit))
            popstate_regexps
        with Exit -> ()
      done
    end
  with End_of_file -> ()

(* Second optimization: Build a single regexp for all states. *)
let popgrep3 () =
  let popstates_regexp =
    Str.regexp
      (".*\\b\\("
       ^ (String.concat "\\|" (List.map Str.quote popstates))
       ^ "\\)\\b") in
  try
    begin
      while true do
        let line = input_line stdin in
        if Str.string_match popstates_regexp line 0
        then print_endline line
      done
    end
  with End_of_file -> ()

(* Speed tests with a 15,000 line input file: *)
let () = popgrep1 ()         (* time: 13.670s *)
let () = popgrep2 ()         (* time:  0.264s *)
let () = popgrep3 ()         (* time:  0.123s *)

Testing for a Valid Pattern

#load "str.cma";;
let () =
  while true do
    print_string "Pattern? ";
    flush stdout;
    let pattern = input_line stdin in
    try ignore (Str.regexp pattern)
    with Failure message ->
      Printf.printf "INVALID PATTERN: %s\n" message
  done

(*-----------------------------*)

let is_valid_pattern pattern =
  try ignore (Str.regexp pattern); true
  with Failure _ -> false

(*-----------------------------*)

#!/usr/bin/ocaml
(* paragrep - trivial paragraph grepper *)
#load "str.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let paragrep pat files =
  let regexp =
    begin
      try Str.regexp pat
      with Failure msg ->
        Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg;
        exit 1
    end in
  let count = ref 0 in
  List.iter
    (fun file ->
       let channel =
         if file = "-"
         then stdin
         else open_in file in
       try
         Stream.iter
           (fun para ->
              incr count;
              try
                ignore (Str.search_forward regexp para 0);
                Printf.printf "%s %d: %s\n\n" file !count para
              with Not_found -> ())
           (paragraph_stream_of_channel channel);
         close_in channel
       with e ->
         close_in channel;
         raise e)
    files

let () =
  match List.tl (Array.to_list Sys.argv) with
    | pat :: [] -> paragrep pat ["-"]
    | pat :: files -> paragrep pat files
    | [] -> Printf.eprintf "usage: %s pat [files]\n" Sys.argv.(0)

(*-----------------------------*)

let safe_pat = Str.quote pat

Honoring Locale Settings in Regular Expressions

(* OCaml does not provide a way to change the locale, and PCRE does
   not appear to be sensitive to the default locale. Regardless, Str
   does not support locales, and PCRE only matches ASCII characters
   for \w and friends. This example instead demonstrates the use of
   PCRE's UTF-8 support to match words, and it does not use locales. *)

#directory "+pcre";;
#load "pcre.cma";;

(* encoded as UTF-8 *)
let name = "andreas k\xc3\xb6nig"

(* the original regexp which is not Unicode-aware *)
let ascii_regexp = Pcre.regexp "\\b(\\w+)\\b"

(* a revised regexp which tests for Unicode letters and numbers *)
let utf8_regexp = Pcre.regexp ~flags:[`UTF8] "([\\pL\\pN]+)"

let () =
  List.iter
    (fun (enc, regexp) ->
       Printf.printf "%s names: %s\n" enc
         (String.concat " "
            (List.map
               String.capitalize
               (List.flatten
                  (Array.to_list
                     (Array.map
                        Array.to_list
                        (Pcre.extract_all
                           ~full_match:false
                           ~rex:regexp
                           name)))))))
    ["ASCII", ascii_regexp; "UTF-8", utf8_regexp]

(*
  ASCII names: Andreas K Nig
  UTF-8 names: Andreas König
*)

Approximate Matching

(* Calculates the Levenshtein, or edit distance, between two strings. *)
let levenshtein s t =
  let n = String.length s in
  let m = String.length t in
  match (m, n) with
    | (m, 0) -> m
    | (0, n) -> n
    | (m, n) ->
        let d = Array.init (m + 1) (fun x -> x) in
        let x = ref 0 in
        for i = 0 to n - 1 do
          let e = ref (i + 1) in
          for j = 0 to m - 1 do
            let cost = if s.[i] = t.[j] then 0 else 1 in
            x :=
              min
                (d.(j + 1) + 1)      (* insertion *)
                (min
                   (!e + 1)          (* deletion *)
                   (d.(j) + cost));  (* substitution *)
            d.(j) <- !e;
            e := !x
          done;
          d.(m) <- !x
        done;
        !x

(* Determines if two strings are an approximate match. *)
let amatch ?(percentage=20) s t =
  levenshtein s t * 100 / String.length s <= percentage

let () =
  let dict = open_in "/usr/dict/words" in
  try
    while true do
      let word = input_line dict in
      if amatch "balast" word
      then print_endline word
    done
  with End_of_file -> close_in dict

(*
  ballast
  blast
*)

Matching from Where the Last Pattern Left Off

#directory "+pcre";;
#load "pcre.cma";;

let s = "12 345 hello 6 7world89 10"
let rex = Pcre.regexp "(\\d+)"

let () =
  let subs = ref (Pcre.exec ~rex s) in
  try
    while true do
      Printf.printf "Found %s\n" (Pcre.get_substring !subs 1);
      subs := Pcre.next_match ~rex !subs
    done
  with Not_found -> ()

(*-----------------------------*)

let () =
  let n = "   49 here" in
  let n = Pcre.replace ~pat:"\\G " ~templ:"0" n in
  print_endline n

(* 00049 here *)

(*-----------------------------*)

let s = "3,4,5,9,120"
let rex = Pcre.regexp "\\G,?(\\d+)"

let () =
  let subs = ref (Pcre.exec ~rex s) in
  try
    while true do
      Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1);
      subs := Pcre.next_match ~rex !subs
    done
  with Not_found -> ()

(*-----------------------------*)

let s = "The year 1752 lost 10 days on the 3rd of September"

let rex = Pcre.regexp "(\\d+)"
let subs = ref (Pcre.exec ~rex s)

let () =
  try
    while true do
      Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1);
      subs := Pcre.next_match ~rex !subs
    done
  with Not_found -> ()

let () =
  let rex = Pcre.regexp "\\G(\\S+)" in
  subs := Pcre.next_match ~rex !subs;
  Printf.printf "Found %s after the last number.\n"
    (Pcre.get_substring !subs 1)

(*
  Found number 1752
  Found number 10
  Found number 3
  Found rd after the last number.
*)

(*-----------------------------*)

let () =
  match Pcre.get_substring_ofs !subs 1 with
    | (start, finish) ->
        Printf.printf
          "The position in 's' is %d..%d\n" start finish

(* The position in 's' is 35..37 *)

Greedy and Non-Greedy Matches

let s = "Even <TT>vi</TT> can edit <TT>troff</TT> effectively."

(* The Str library does not support non-greedy matches. In many cases,
   you can turn a non-greedy match into a greedy one, however: *)

#load "str.cma";;

let () = print_endline (Str.global_replace (Str.regexp "<.*>") "" s)
(* Even  effectively. *)
let () = print_endline (Str.global_replace (Str.regexp "<[^>]*>") "" s)
(* Even vi can edit troff effectively. *)

(* If you need non-greedy matches, you'll want to use PCRE instead: *)

#directory "+pcre";;
#load "pcre.cma";;

let () = print_endline (Pcre.replace ~pat:"<.*?>" ~templ:"" s)
(* Even vi can edit troff effectively. *)

(* Non-greedy matches don't always work the way you expect: *)

let s = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>"

let rex = Pcre.regexp "<b><i>(.*?)</i></b>"
let () = print_endline (Pcre.extract ~rex s).(1)
(* this</i> and <i>that</i> are important</b> Oh, <b><i>me too! *)

(* One solution is to use a non-grouping negative lookahead assertion: *)

let rex = Pcre.regexp "<b><i>((?:(?!</b>|</i>).)*)</i></b>"
let () = print_endline (Pcre.extract ~rex s).(1)
(* me too! *)

(* If performance is important, here is a faster technique: *)

let rex = Pcre.regexp ~flags:[`DOTALL; `EXTENDED] "
    <b><i>
    [^<]*  # stuff not possibly bad, and not possibly the end.
    (?:
 # at this point, we can have '<' if not part of something bad
     (?!  </?[ib]>  )   # what we can't have
     <                  # okay, so match the '<'
     [^<]*              # and continue with more safe stuff
    ) *
    </i></b>
"

let () = print_endline (Pcre.extract ~rex s).(0)
(* <b><i>me too!</i></b> *)

Detecting Duplicate Words

#directory "+pcre";;
#load "pcre.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let paragraph_stream_of_channel channel =
  let lines = line_stream_of_channel channel in
  let rec next para_lines i =
    match Stream.peek lines, para_lines with
      | None, [] -> None
      | Some "", [] -> Stream.junk lines; next para_lines i
      | Some "", _
      | None, _ -> Some (String.concat "\n" (List.rev para_lines))
      | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in
  Stream.from (next [])

let find_dup_words files =
  let rex = Pcre.regexp ~flags:[`CASELESS; `EXTENDED] "
      \\b            # start at a word boundary (begin letters)
      (\\S+)         # find chunk of non-whitespace
      \\b            # until another word boundary (end letters)
      (
          \\s+       # separated by some whitespace
          \\1        # and that very same chunk again
          \\b        # until another word boundary
      ) +            # one or more sets of those
  " in
  let count = ref 0 in
  List.iter
    (fun file ->
       let channel = if file = "-" then stdin else open_in file in
       try
         Stream.iter
           (fun para ->
              incr count;
              try
                let subs = ref (Pcre.exec ~rex para) in
                while true do
                  Printf.printf "dup word '%s' at paragraph %d.\n"
                    (Pcre.get_substring !subs 1)
                    !count;
                  flush stdout;
                  subs := Pcre.next_match ~rex !subs;
                done
              with Not_found -> ())
           (paragraph_stream_of_channel channel);
         close_in channel
       with e ->
         close_in channel;
         raise e)
    files

let () =
  match List.tl (Array.to_list Sys.argv) with
    | [] -> find_dup_words ["-"]
    | files -> find_dup_words files

(*-----------------------------*)

(*
  This is a test
  test of the duplicate word finder.
*)

(* dup word 'test' at paragraph 1. *)

(*-----------------------------*)

let a = "nobody"
let b = "bodysnatcher"
let () =
  try
    let subs =
      Pcre.exec
        ~pat:"^(\\w+)(\\w+) \\2(\\w+)$"
        (a ^ " " ^ b) in
    Printf.printf "%s overlaps in %s-%s-%s\n"
      (Pcre.get_substring subs 2)
      (Pcre.get_substring subs 1)
      (Pcre.get_substring subs 2)
      (Pcre.get_substring subs 3)
  with Not_found ->
    ()

(* body overlaps in no-body-snatcher *)

(*-----------------------------*)

#!/usr/bin/ocaml
(* prime_pattern -- find prime factors of argument using pattern matching *)
#directory "+pcre";;
#load "pcre.cma";;

let arg = try int_of_string Sys.argv.(1) with _ -> 0
let n = ref (String.make arg 'o')
let rex = Pcre.regexp "^(oo+?)\\1+$"
let templ = "o"
let () =
  try
    while true do
      let pat = Pcre.get_substring (Pcre.exec ~rex !n) 1 in
      Printf.printf "%d " (String.length pat);
      n := Pcre.replace ~pat ~templ !n
    done
  with Not_found ->
    Printf.printf "%d\n" (String.length !n)

(*-----------------------------*)

exception Found of (int * int * int)
let () =
  try
    match
      Pcre.extract
        ~full_match:false
        ~pat:"^(o*)\\1{11}(o*)\\2{14}(o*)\\3{15}$"
        (String.make 281 'o')
    with
      | [| x; y; z |] -> raise (Found
                                  (String.length x,
                                   String.length y,
                                   String.length z))
      | _ -> raise Not_found
  with
    | Found (x, y, z) ->
        Printf.printf "One solution is: x=%d; y=%d; z=%d.\n"
          x y z
    | Not_found ->
        Printf.printf "No solution.\n"

(* One solution is: x=17; y=3; z=2. *)

(*-----------------------------*)

~pat:"^(o+)\\1{11}(o+)\\2{14}(o+)\\3{15}$"
(* One solution is: x=17; y=3; z=2. *)

~pat:"^(o*?)\\1{11}(o*)\\2{14}(o*)\\3{15}$"
(* One solution is: x=0; y=7; z=11. *)

~pat:"^(o+?)\\1{11}(o*)\\2{14}(o*)\\3{15}$"
(* One solution is: x=1; y=3; z=14. *)

Expressing AND, OR, and NOT in a Single Pattern

#directory "+pcre";;
#load "pcre.cma";;

let pat = input_line config_channel
let () = if Pcre.pmatch ~pat data then (* ... *) ()

(*-----------------------------*)

(* alpha OR beta *)
let regexp = Pcre.regexp "alpha|beta"

(* alpha AND beta *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?=.*alpha)(?=.*beta)"

(* alpha AND beta, no overlap *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "alpha.*beta|beta.*alpha"

(* NOT pat *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?:(?!pat).)*$"

(* NOT bad BUT good *)
let regexp = Pcre.regexp ~flags:[`DOTALL] "(?=(?:(?!bad).)*$)good"

(*-----------------------------*)

let () =
  if not (Pcre.pmatch ~rex:regexp text)
  then something ()

(*-----------------------------*)

let () =
  if (Pcre.pmatch ~rex:rexexp1 text) && (Pcre.pmatch ~rex:rexexp2 text)
  then something ()

(*-----------------------------*)

let () =
  if (Pcre.pmatch ~rex:rexexp1 text) || (Pcre.pmatch ~rex:rexexp2 text)
  then something ()

(*-----------------------------*)

#!/usr/bin/ocaml
(* minigrep - trivial grep *)
#directory "+pcre";;
#load "pcre.cma";;

let line_stream_of_channel channel =
  Stream.from
    (fun _ -> try Some (input_line channel) with End_of_file -> None)

let minigrep pat files =
  let rex =
    try Pcre.regexp pat
    with Pcre.BadPattern (msg, _) ->
      Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg;
      exit 1 in
  let process file =
    let channel = if file = "-" then stdin else open_in file in
    try
      Stream.iter
        (fun line -> if Pcre.pmatch ~rex line then print_endline line)
        (line_stream_of_channel channel);
      close_in channel
    with e ->
      close_in channel;
      raise e in
  List.iter process files

let () =
  match List.tl (Array.to_list Sys.argv) with
    | pat :: [] -> minigrep pat ["-"]
    | pat :: files -> minigrep pat files