(* We will use the Str library distributed with OCaml for regular expressions. * There are two ways to use the str library, building a top or passing it to ocaml. * Under Unix, you can create a new toplevel which has the Str module: * $ ocamlmktop -o strtop str.cma * $ ./strtop * Now you don't need to prefix the contents of the str module with Str. * The alternative is to pass str.cma as a parameter: * $ ocaml str.cma * Now you may refer to the contents of the str module by using Str. * Under Windows, if you are using ocamlwin.exe you can simply load Str: * # load "str.cma";; *) (* Str.search_forward returns an int or throws an exception if the pattern isn't found. * In Perl, the =~ operator returns null. Since these two values have different * types in OCaml, we cannot copy this behaviour directly. * Instead, we return an impossible index, -1 using try ... with. * Another method would be to define an =~ operator and use that directly: # let (=~) s re = Str.string_match (Str.regexp re) s 0;; val ( =~ ) : string -> string -> bool = <fun> # "abc" =~ "a";; - : bool = true # "bc" =~ "a";; - : bool = false * Don't underestimate the power of this. Many of the following examples could be * simplified by defining infix operators. *) try Str.search_forward (Str.regexp pattern) string 0; with Not_found -> -1;; try Str.replace_first (Str.regexp pattern) replacement string; with Not_found -> "";; (* ---------------------------- *) try (Str.search_forward (Str.regexp "sheep") meadow 0) > -1; with Not_found -> false;; (* true if meadow contains "sheep" *) try not ((Str.search_forward (Str.regexp "sheep") meadow 0) > -1); with Not_found -> true;; (* true if meadow doesn't contain "sheep" *) let meadow = try Str.replace_first (Str.regexp "old") "new" meadow; with Not_found -> meadow;; (* Replace "old" with "new" in meadow *) (* ---------------------------- *) try let temp = Str.search_forward (Str.regexp "\\bovines?\\b") meadow 0 in print_string "Here be sheep!"; with Not_found -> ();; (* ---------------------------- *) let string = "good food" in try Str.replace_first (Str.regexp "o*") "e" string; with Not_found -> string;; (* ---------------------------- *) (* There is no way to take command line parameters to ocaml that I know of. * You would first have to compile your OCaml program using ocamlc. *) (* ---------------------------- *) let rec match_num s start= if String.length s > 0 then try let temp = Str.search_forward (Str.regexp "[0123456789]+") s start in print_string (String.concat "" ("Found number " :: Str.matched_string s :: ["\n"])); match_num s (temp + 1); with Not_found -> (); else ();; (* ---------------------------- *) let rec match_group s start numbers= if String.length s > 0 then try let temp = (Str.search_forward (Str.regexp "[0123456789]+") s start) in let numbers = Str.matched_string s :: numbers in match_group s (temp + 1) numbers; with Not_found -> numbers; else numbers;; (* ---------------------------- *) let (=+) s re = let result = ref [] in let offset = ref 0 in while ((String.length s) > !offset) do try offset := 1 + (Str.search_forward (Str.regexp re) s !offset); result := !result @ [Str.matched_string s] @ []; with Not_found -> ignore (offset := String.length s) done; result;; let (=-) s re = let result = ref [] in let offset = ref 0 in while ((String.length s) > !offset) do try ignore (Str.search_forward (Str.regexp re) s !offset); offset := Str.match_end (); result := !result @ [Str.matched_string s] @ []; with Not_found -> ignore (offset := String.length s) done; result;; let digits = "123456789";; let yeslap = digits =+ "[1234567890][1234567890][1234567890]";; let nonlap = digits =- "[1234567890][1234567890][1234567890]";; print_string "Non-overlapping: "; List.iter (fun v -> print_string (v ^ " ")) !nonlap; print_string "\n";; (* Non-overlapping: 123 456 789 *) print_string "Overlapping: "; List.iter (fun v -> print_string (v ^ " ")) !yeslap; print_string "\n";; (* Overlapping: 123 234 345 456 567 678 789 *) (* ---------------------------- *) let index = ref 0;; let string = "And little lambs eat ivy";; try index := Str.search_forward (Str.regexp "l[^s]*s") string 0; with Not_found -> ();; print_string ("(" ^ (String.sub string 0 !index) ^ ") "); print_string ("(" ^ (Str.matched_string string) ^ ") "); print_string ("(" ^ (Str.string_after string 16) ^ ")\n");; (* (And ) (little lambs) ( eat ivy) *) |
#load "str.cma";; (* The Str module doesn't modify strings in place; you always get a copy when you perform a substitution. *) let dst = Str.global_replace (Str.regexp "this") "that" src (* Strip to basename. *) let progname = Str.replace_first (Str.regexp "^.*/") "" Sys.argv.(0) (* Make All Words Title-Cased. *) let capword = Str.global_substitute (Str.regexp "\\b.") (fun s -> String.uppercase (Str.matched_string s)) words (* /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1 *) let catpage = Str.replace_first (Str.regexp "man\\([0-9]\\)") "cat\\1" manpage (* Copy and substitute on all strings in a list. *) let bindirs = ["/usr/bin"; "/bin"; "/usr/local/bin"] let libdirs = List.map (fun s -> Str.replace_first (Str.regexp "bin") "lib" s) bindirs (* ["/usr/lib"; "/lib"; "/usr/local/lib"] *) |
(* Str can do a simple character range match, but it isn't very practical for matching alphabetic characters in general. *) #load "str.cma";; let () = if Str.string_match (Str.regexp "^[A-Za-z]+$") var 0 then print_endline "var is purely alphabetic" (* With Pcre, you can use UTF8 support and match characters with the letter property. *) #directory "+pcre";; #load "pcre.cma";; let () = if Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`UTF8] "^\\pL+$") var then print_endline "var is purely alphabetic" |
#load "str.cma";; (* Str's regexps lack a whitespace-matching pattern. Here is a substitute. *) let whitespace_chars = String.concat "" (List.map (String.make 1) [ Char.chr 9; (* HT *) Char.chr 10; (* LF *) Char.chr 11; (* VT *) Char.chr 12; (* FF *) Char.chr 13; (* CR *) Char.chr 32; (* space *) ]) let space = "[" ^ whitespace_chars ^ "]" let non_space = "[^" ^ whitespace_chars ^ "]" (* as many non-whitespace characters as possible *) let regexp = Str.regexp (non_space ^ "+") (* as many letters, apostrophes, and hyphens *) let regexp = Str.regexp "[A-Za-z'-]+" (* usually best *) let regexp = Str.regexp "\\b\\([A-Za-z]+\\)\\b" (* fails at ends or w/ punctuation *) let regexp = Str.regexp (space ^ "\\([A-Za-z]+\\)" ^ space) |
#!/usr/bin/ocaml (* resname - change all "foo.bar.com" style names in the input stream into "foo.bar.com [204.148.40.9]" (or whatever) instead *) #directory "+pcre";; #load "pcre.cma";; #load "unix.cma";; let regexp = Pcre.regexp ~flags:[`EXTENDED] " ( # capture the hostname in substring 1 (?: # these parens for grouping only (?! [-_] ) # lookahead for neither underscore nor dash [\\w-] + # hostname component \\. # and the domain dot ) + # now repeat that whole thing a bunch of times [A-Za-z] # next must be a letter [\\w-] + # now trailing domain part ) # end of substring 1 capture " let process line = print_endline (Pcre.substitute_substrings ~rex:regexp ~subst:(fun subs -> let name = Pcre.get_substring subs 1 in let addr = try Unix.string_of_inet_addr (Unix.gethostbyname name).Unix.h_addr_list.(0) with Not_found -> "???" in name ^ " [" ^ addr ^ "]") line) let () = try while true do let line = read_line () in process line done with End_of_file -> () (*-----------------------------*) let vars = Hashtbl.create 0 let () = Hashtbl.replace vars "name" "Bob"; Hashtbl.replace vars "flavor" "rhubarb" let () = print_endline (Pcre.substitute_substrings ~rex:(Pcre.regexp ~flags:[`EXTENDED] " \\# # a pound sign (\\w+) # the variable name \\# # another pound sign ") ~subst:(fun subs -> Hashtbl.find vars (Pcre.get_substring subs 1)) "Hello, #name#, would you like some #flavor# pie?") |
#load "str.cma";; let want = 3 let count = ref 0 let pond = "One fish two fish red fish blue fish" let regexp = Str.regexp_case_fold "\\([a-z]+\\)[ ]+fish\\b" exception Found of string let () = let start = ref 0 in try while true do ignore (Str.search_forward regexp pond !start); start := !start + String.length (Str.matched_string pond); incr count; if !count = want then raise (Found (Str.matched_group 1 pond)) done with | Found color -> Printf.printf "The third fish is a %s one.\n" color | Not_found -> Printf.printf "Only found %d fish!\n" !count (* The third fish is a red one. *) (*-----------------------------*) let colors = let start = ref 0 in let fish = ref [] in begin try while true do ignore (Str.search_forward regexp pond !start); start := !start + (String.length (Str.matched_string pond)); fish := (Str.matched_group 1 pond) :: !fish done; with Not_found -> () end; Array.of_list (List.rev !fish) let () = Printf.printf "The third fish in the pond is %s.\n" colors.(2) (* The third fish in the pond is red. *) (*-----------------------------*) let evens = let colors' = ref [] in Array.iteri (fun i color -> if i mod 2 = 1 then colors' := color :: !colors') colors; List.rev !colors' let () = Printf.printf "Even numbered fish are %s.\n" (String.concat " " evens) (* Even numbered fish are two blue. *) (*-----------------------------*) let () = let count = ref 0 in print_endline (Str.global_substitute (Str.regexp_case_fold "\\b\\([a-z]+\\)\\([ ]+fish\\b\\)") (fun s -> incr count; if !count = 4 then "sushi" ^ Str.matched_group 2 s else Str.matched_group 1 s ^ Str.matched_group 2 s) pond) (* One fish two fish red fish sushi fish *) (*-----------------------------*) let pond = "One fish two fish red fish blue fish swim here." let regexp = Str.regexp_case_fold "\\b\\([a-z]+\\)[ ]+fish\\b" let colors = let rec loop start acc = try ignore (Str.search_forward regexp pond start); loop (start + String.length (Str.matched_string pond)) (Str.matched_group 1 pond :: acc) with Not_found -> acc in loop 0 [] let color = List.hd colors let () = Printf.printf "Last fish is %s.\n" color (* Last fish is blue. *) |
#!/usr/bin/ocaml (* killtags - very bad html tag killer *) #load "str.cma";; let regexp = Str.regexp "<[^>]*>" let () = List.iter (fun filename -> let lines = ref [] in let in_channel = open_in filename in try begin try while true do lines := input_line in_channel :: !lines done with End_of_file -> () end; let contents = String.concat "\n" (List.rev !lines) in print_endline (String.concat "" (List.map (function | Str.Text s -> s | _ -> "") (Str.full_split regexp contents))); close_in in_channel with e -> close_in in_channel; raise e) (List.tl (Array.to_list Sys.argv)) (*-----------------------------*) #!/usr/bin/ocaml (* headerfy - change certain chapter headers to html *) #load "str.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let paragraph_stream_of_channel channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some "", [] -> Stream.junk lines; next para_lines i | Some "", _ | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) let regexp = Str.regexp "^Chapter[\r\n\t ]+[0-9]+[\r\n\t ]*:[^\r\n]*" let headerfy chunk = String.concat "" (List.map (function | Str.Text s -> s | Str.Delim s -> "<H1>" ^ s ^ "</H1>") (Str.full_split regexp chunk)) let () = List.iter (fun filename -> let in_channel = open_in filename in try Stream.iter (fun para -> print_endline (headerfy para); print_newline ()) (paragraph_stream_of_channel in_channel); close_in in_channel with e -> close_in in_channel; raise e) (List.tl (Array.to_list Sys.argv)) |
#load "str.cma";; let chunks = let lines = ref [] in begin try while true do lines := input_line stdin :: !lines done with End_of_file -> () end; let contents = String.concat "\n" (List.rev !lines) in Str.full_split (Str.regexp "^\\.\\(Ch\\|Se\\|Ss\\)$") contents let () = Printf.printf "I read %d chunks.\n" (List.length chunks) |
#load "str.cma";; (* Creates a stream that produces ranges of items from another stream. Production of items starts when when (start_test count item) returns true and stops when (finish_test count item) returns true. Multiple ranges will be produced if start_test returns true again. The count starts at 1. Ranges are inclusive; the item that causes finish_test to return true will be produced. *) let stream_range start_test finish_test stream = let active = ref false in let count = ref 1 in let rec next i = match Stream.peek stream with | None -> None | Some item -> if not !active then begin if start_test !count item then (active := true; next i) else (Stream.junk stream; incr count; next i) end else begin if finish_test !count item then active := false; Stream.junk stream; incr count; Some item end in Stream.from next (* Creates a stream that produces items between a pair of indices. If start = 2 and finish = 4, items 2, 3, and 4 will be produced. The first item is number 1. *) let stream_range_numbers start finish stream = stream_range (fun count _ -> count = start) (fun count _ -> count = finish) stream (* Creates a stream that produces strings between a pair of regexps. The regexp will be tested using Str.string_match. *) let stream_range_patterns start finish stream = stream_range (fun _ line -> Str.string_match start line 0) (fun _ line -> Str.string_match finish line 0) stream (* Produce a stream of lines from an input channel. *) let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) (* Print lines 15 through 17 inclusive. *) let () = Stream.iter print_endline (stream_range_numbers 15 17 (line_stream_of_channel (open_in datafile))) (* Print out all <XMP> .. </XMP> displays from HTML doc. *) let () = Stream.iter print_endline (stream_range_patterns (Str.regexp ".*<XMP>") (Str.regexp ".*</XMP>") (line_stream_of_channel stdin)) (*-----------------------------*) let in_header = ref true let in_body = ref false let () = Stream.iter (fun line -> if !in_header && line = "" then (in_header := false; in_body := true) else begin (* do something with line *) end) (line_stream_of_channel stdin) (*-----------------------------*) module StringSet = Set.Make(String) let seen = ref StringSet.empty let email_regexp = Str.regexp "\\([^<>(),; \t]+@[^<>(),; \t]+\\)" let () = Stream.iter (fun line -> List.iter (function | Str.Delim email -> if not (StringSet.mem email !seen) then begin seen := StringSet.add email !seen; print_endline email; end | _ -> ()) (Str.full_split email_regexp line)) (stream_range_patterns (Str.regexp "^From:?[ \t]") (Str.regexp "^$") (line_stream_of_channel stdin)) |
let regexp_string_of_glob s = let i, buffer = ref (-1), Buffer.create (String.length s + 8) in let read () = try incr i; Some s.[!i] with Invalid_argument _ -> None in let write = Buffer.add_string buffer in let rec parse_glob () = match read () with | Some '*' -> write ".*"; parse_glob () | Some '?' -> write "."; parse_glob () | Some '[' -> parse_bracket "" | Some c -> write (Str.quote (String.make 1 c)); parse_glob () | None -> () and parse_bracket text = match read () with | Some '!' when text = "" -> parse_bracket "^" | Some ']' -> write ("[" ^ text ^ "]"); parse_glob () | Some c -> parse_bracket (text ^ (String.make 1 c)) | None -> write (Str.quote ("[" ^ text)) in write "^"; parse_glob (); write "$"; Buffer.contents buffer let regexp_of_glob s = Str.regexp (regexp_string_of_glob s) |
#load "str.cma";; let popstates = ["CO"; "ON"; "MI"; "WI"; "MN"] (* Naive version: Compile a regexp each time it is needed. *) let popgrep1 () = try begin while true do let line = input_line stdin in try List.iter (fun state -> if (Str.string_match (Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b")) line 0) then (print_endline line; raise Exit)) popstates with Exit -> () done end with End_of_file -> () (* First optimization: Compile the regexps in advance. *) let popgrep2 () = let popstate_regexps = List.map (fun state -> Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b")) popstates in try begin while true do let line = input_line stdin in try List.iter (fun regexp -> if (Str.string_match regexp line 0) then (print_endline line; raise Exit)) popstate_regexps with Exit -> () done end with End_of_file -> () (* Second optimization: Build a single regexp for all states. *) let popgrep3 () = let popstates_regexp = Str.regexp (".*\\b\\(" ^ (String.concat "\\|" (List.map Str.quote popstates)) ^ "\\)\\b") in try begin while true do let line = input_line stdin in if Str.string_match popstates_regexp line 0 then print_endline line done end with End_of_file -> () (* Speed tests with a 15,000 line input file: *) let () = popgrep1 () (* time: 13.670s *) let () = popgrep2 () (* time: 0.264s *) let () = popgrep3 () (* time: 0.123s *) |
#load "str.cma";; let () = while true do print_string "Pattern? "; flush stdout; let pattern = input_line stdin in try ignore (Str.regexp pattern) with Failure message -> Printf.printf "INVALID PATTERN: %s\n" message done (*-----------------------------*) let is_valid_pattern pattern = try ignore (Str.regexp pattern); true with Failure _ -> false (*-----------------------------*) #!/usr/bin/ocaml (* paragrep - trivial paragraph grepper *) #load "str.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let paragraph_stream_of_channel channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some "", [] -> Stream.junk lines; next para_lines i | Some "", _ | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) let paragrep pat files = let regexp = begin try Str.regexp pat with Failure msg -> Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg; exit 1 end in let count = ref 0 in List.iter (fun file -> let channel = if file = "-" then stdin else open_in file in try Stream.iter (fun para -> incr count; try ignore (Str.search_forward regexp para 0); Printf.printf "%s %d: %s\n\n" file !count para with Not_found -> ()) (paragraph_stream_of_channel channel); close_in channel with e -> close_in channel; raise e) files let () = match List.tl (Array.to_list Sys.argv) with | pat :: [] -> paragrep pat ["-"] | pat :: files -> paragrep pat files | [] -> Printf.eprintf "usage: %s pat [files]\n" Sys.argv.(0) (*-----------------------------*) let safe_pat = Str.quote pat |
(* OCaml does not provide a way to change the locale, and PCRE does not appear to be sensitive to the default locale. Regardless, Str does not support locales, and PCRE only matches ASCII characters for \w and friends. This example instead demonstrates the use of PCRE's UTF-8 support to match words, and it does not use locales. *) #directory "+pcre";; #load "pcre.cma";; (* encoded as UTF-8 *) let name = "andreas k\xc3\xb6nig" (* the original regexp which is not Unicode-aware *) let ascii_regexp = Pcre.regexp "\\b(\\w+)\\b" (* a revised regexp which tests for Unicode letters and numbers *) let utf8_regexp = Pcre.regexp ~flags:[`UTF8] "([\\pL\\pN]+)" let () = List.iter (fun (enc, regexp) -> Printf.printf "%s names: %s\n" enc (String.concat " " (List.map String.capitalize (List.flatten (Array.to_list (Array.map Array.to_list (Pcre.extract_all ~full_match:false ~rex:regexp name))))))) ["ASCII", ascii_regexp; "UTF-8", utf8_regexp] (* ASCII names: Andreas K Nig UTF-8 names: Andreas König *) |
(* Calculates the Levenshtein, or edit distance, between two strings. *) let levenshtein s t = let n = String.length s in let m = String.length t in match (m, n) with | (m, 0) -> m | (0, n) -> n | (m, n) -> let d = Array.init (m + 1) (fun x -> x) in let x = ref 0 in for i = 0 to n - 1 do let e = ref (i + 1) in for j = 0 to m - 1 do let cost = if s.[i] = t.[j] then 0 else 1 in x := min (d.(j + 1) + 1) (* insertion *) (min (!e + 1) (* deletion *) (d.(j) + cost)); (* substitution *) d.(j) <- !e; e := !x done; d.(m) <- !x done; !x (* Determines if two strings are an approximate match. *) let amatch ?(percentage=20) s t = levenshtein s t * 100 / String.length s <= percentage let () = let dict = open_in "/usr/dict/words" in try while true do let word = input_line dict in if amatch "balast" word then print_endline word done with End_of_file -> close_in dict (* ballast blast *) |
#directory "+pcre";; #load "pcre.cma";; let s = "12 345 hello 6 7world89 10" let rex = Pcre.regexp "(\\d+)" let () = let subs = ref (Pcre.exec ~rex s) in try while true do Printf.printf "Found %s\n" (Pcre.get_substring !subs 1); subs := Pcre.next_match ~rex !subs done with Not_found -> () (*-----------------------------*) let () = let n = " 49 here" in let n = Pcre.replace ~pat:"\\G " ~templ:"0" n in print_endline n (* 00049 here *) (*-----------------------------*) let s = "3,4,5,9,120" let rex = Pcre.regexp "\\G,?(\\d+)" let () = let subs = ref (Pcre.exec ~rex s) in try while true do Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1); subs := Pcre.next_match ~rex !subs done with Not_found -> () (*-----------------------------*) let s = "The year 1752 lost 10 days on the 3rd of September" let rex = Pcre.regexp "(\\d+)" let subs = ref (Pcre.exec ~rex s) let () = try while true do Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1); subs := Pcre.next_match ~rex !subs done with Not_found -> () let () = let rex = Pcre.regexp "\\G(\\S+)" in subs := Pcre.next_match ~rex !subs; Printf.printf "Found %s after the last number.\n" (Pcre.get_substring !subs 1) (* Found number 1752 Found number 10 Found number 3 Found rd after the last number. *) (*-----------------------------*) let () = match Pcre.get_substring_ofs !subs 1 with | (start, finish) -> Printf.printf "The position in 's' is %d..%d\n" start finish (* The position in 's' is 35..37 *) |
let s = "Even <TT>vi</TT> can edit <TT>troff</TT> effectively." (* The Str library does not support non-greedy matches. In many cases, you can turn a non-greedy match into a greedy one, however: *) #load "str.cma";; let () = print_endline (Str.global_replace (Str.regexp "<.*>") "" s) (* Even effectively. *) let () = print_endline (Str.global_replace (Str.regexp "<[^>]*>") "" s) (* Even vi can edit troff effectively. *) (* If you need non-greedy matches, you'll want to use PCRE instead: *) #directory "+pcre";; #load "pcre.cma";; let () = print_endline (Pcre.replace ~pat:"<.*?>" ~templ:"" s) (* Even vi can edit troff effectively. *) (* Non-greedy matches don't always work the way you expect: *) let s = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>" let rex = Pcre.regexp "<b><i>(.*?)</i></b>" let () = print_endline (Pcre.extract ~rex s).(1) (* this</i> and <i>that</i> are important</b> Oh, <b><i>me too! *) (* One solution is to use a non-grouping negative lookahead assertion: *) let rex = Pcre.regexp "<b><i>((?:(?!</b>|</i>).)*)</i></b>" let () = print_endline (Pcre.extract ~rex s).(1) (* me too! *) (* If performance is important, here is a faster technique: *) let rex = Pcre.regexp ~flags:[`DOTALL; `EXTENDED] " <b><i> [^<]* # stuff not possibly bad, and not possibly the end. (?: # at this point, we can have '<' if not part of something bad (?! </?[ib]> ) # what we can't have < # okay, so match the '<' [^<]* # and continue with more safe stuff ) * </i></b> " let () = print_endline (Pcre.extract ~rex s).(0) (* <b><i>me too!</i></b> *) |
#directory "+pcre";; #load "pcre.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let paragraph_stream_of_channel channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some "", [] -> Stream.junk lines; next para_lines i | Some "", _ | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) let find_dup_words files = let rex = Pcre.regexp ~flags:[`CASELESS; `EXTENDED] " \\b # start at a word boundary (begin letters) (\\S+) # find chunk of non-whitespace \\b # until another word boundary (end letters) ( \\s+ # separated by some whitespace \\1 # and that very same chunk again \\b # until another word boundary ) + # one or more sets of those " in let count = ref 0 in List.iter (fun file -> let channel = if file = "-" then stdin else open_in file in try Stream.iter (fun para -> incr count; try let subs = ref (Pcre.exec ~rex para) in while true do Printf.printf "dup word '%s' at paragraph %d.\n" (Pcre.get_substring !subs 1) !count; flush stdout; subs := Pcre.next_match ~rex !subs; done with Not_found -> ()) (paragraph_stream_of_channel channel); close_in channel with e -> close_in channel; raise e) files let () = match List.tl (Array.to_list Sys.argv) with | [] -> find_dup_words ["-"] | files -> find_dup_words files (*-----------------------------*) (* This is a test test of the duplicate word finder. *) (* dup word 'test' at paragraph 1. *) (*-----------------------------*) let a = "nobody" let b = "bodysnatcher" let () = try let subs = Pcre.exec ~pat:"^(\\w+)(\\w+) \\2(\\w+)$" (a ^ " " ^ b) in Printf.printf "%s overlaps in %s-%s-%s\n" (Pcre.get_substring subs 2) (Pcre.get_substring subs 1) (Pcre.get_substring subs 2) (Pcre.get_substring subs 3) with Not_found -> () (* body overlaps in no-body-snatcher *) (*-----------------------------*) #!/usr/bin/ocaml (* prime_pattern -- find prime factors of argument using pattern matching *) #directory "+pcre";; #load "pcre.cma";; let arg = try int_of_string Sys.argv.(1) with _ -> 0 let n = ref (String.make arg 'o') let rex = Pcre.regexp "^(oo+?)\\1+$" let templ = "o" let () = try while true do let pat = Pcre.get_substring (Pcre.exec ~rex !n) 1 in Printf.printf "%d " (String.length pat); n := Pcre.replace ~pat ~templ !n done with Not_found -> Printf.printf "%d\n" (String.length !n) (*-----------------------------*) exception Found of (int * int * int) let () = try match Pcre.extract ~full_match:false ~pat:"^(o*)\\1{11}(o*)\\2{14}(o*)\\3{15}$" (String.make 281 'o') with | [| x; y; z |] -> raise (Found (String.length x, String.length y, String.length z)) | _ -> raise Not_found with | Found (x, y, z) -> Printf.printf "One solution is: x=%d; y=%d; z=%d.\n" x y z | Not_found -> Printf.printf "No solution.\n" (* One solution is: x=17; y=3; z=2. *) (*-----------------------------*) ~pat:"^(o+)\\1{11}(o+)\\2{14}(o+)\\3{15}$" (* One solution is: x=17; y=3; z=2. *) ~pat:"^(o*?)\\1{11}(o*)\\2{14}(o*)\\3{15}$" (* One solution is: x=0; y=7; z=11. *) ~pat:"^(o+?)\\1{11}(o*)\\2{14}(o*)\\3{15}$" (* One solution is: x=1; y=3; z=14. *) |
#directory "+pcre";; #load "pcre.cma";; let pat = input_line config_channel let () = if Pcre.pmatch ~pat data then (* ... *) () (*-----------------------------*) (* alpha OR beta *) let regexp = Pcre.regexp "alpha|beta" (* alpha AND beta *) let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?=.*alpha)(?=.*beta)" (* alpha AND beta, no overlap *) let regexp = Pcre.regexp ~flags:[`DOTALL] "alpha.*beta|beta.*alpha" (* NOT pat *) let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?:(?!pat).)*$" (* NOT bad BUT good *) let regexp = Pcre.regexp ~flags:[`DOTALL] "(?=(?:(?!bad).)*$)good" (*-----------------------------*) let () = if not (Pcre.pmatch ~rex:regexp text) then something () (*-----------------------------*) let () = if (Pcre.pmatch ~rex:rexexp1 text) && (Pcre.pmatch ~rex:rexexp2 text) then something () (*-----------------------------*) let () = if (Pcre.pmatch ~rex:rexexp1 text) || (Pcre.pmatch ~rex:rexexp2 text) then something () (*-----------------------------*) #!/usr/bin/ocaml (* minigrep - trivial grep *) #directory "+pcre";; #load "pcre.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let minigrep pat files = let rex = try Pcre.regexp pat with Pcre.BadPattern (msg, _) -> Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg; exit 1 in let process file = let channel = if file = "-" then stdin else open_in file in try Stream.iter (fun line -> if Pcre.pmatch ~rex line then print_endline line) (line_stream_of_channel channel); close_in channel with e -> close_in channel; raise e in List.iter process files let () = match List.tl (Array.to_list Sys.argv) with | pat :: [] -> minigrep pat ["-"] | pat :: files -> minigrep pat files |