(* We will use the Str library distributed with OCaml for regular expressions. * There are two ways to use the str library, building a top or passing it to ocaml. * Under Unix, you can create a new toplevel which has the Str module: * $ ocamlmktop -o strtop str.cma * $ ./strtop * Now you don't need to prefix the contents of the str module with Str. * The alternative is to pass str.cma as a parameter: * $ ocaml str.cma * Now you may refer to the contents of the str module by using Str. * Under Windows, if you are using ocamlwin.exe you can simply load Str: * # load "str.cma";; *) (* Str.search_forward returns an int or throws an exception if the pattern isn't found. * In Perl, the =~ operator returns null. Since these two values have different * types in OCaml, we cannot copy this behaviour directly. * Instead, we return an impossible index, -1 using try ... with. * Another method would be to define an =~ operator and use that directly: # let (=~) s re = Str.string_match (Str.regexp re) s 0;; val ( =~ ) : string -> string -> bool = <fun> # "abc" =~ "a";; - : bool = true # "bc" =~ "a";; - : bool = false * Don't underestimate the power of this. Many of the following examples could be * simplified by defining infix operators. *) try Str.search_forward (Str.regexp pattern) string 0; with Not_found -> -1;; try Str.replace_first (Str.regexp pattern) replacement string; with Not_found -> "";; (*-----------------------------*) try (Str.search_forward (Str.regexp "sheep") meadow 0) > -1; with Not_found -> false;; (* true if meadow contains "sheep" *) try not ((Str.search_forward (Str.regexp "sheep") meadow 0) > -1); with Not_found -> true;; (* true if meadow doesn't contain "sheep" *) let meadow = try Str.replace_first (Str.regexp "old") "new" meadow; with Not_found -> meadow;; (* Replace "old" with "new" in meadow *) (*-----------------------------*) try let temp = Str.search_forward (Str.regexp "\\bovines?\\b") meadow 0 in print_string "Here be sheep!"; with Not_found -> ();; (*-----------------------------*) let string = "good food" in try Str.replace_first (Str.regexp "o*") "e" string; with Not_found -> string;; (*-----------------------------*) (* There is no way to take command line parameters to ocaml that I know of. * You would first have to compile your OCaml program using ocamlc. *) (*-----------------------------*) let rec match_num s start= if String.length s > 0 then try let temp = Str.search_forward (Str.regexp "[0123456789]+") s start in print_string (String.concat "" ("Found number " :: Str.matched_string s :: ["\n"])); match_num s (temp + 1); with Not_found -> (); else ();; (*-----------------------------*) let rec match_group s start numbers= if String.length s > 0 then try let temp = (Str.search_forward (Str.regexp "[0123456789]+") s start) in let numbers = Str.matched_string s :: numbers in match_group s (temp + 1) numbers; with Not_found -> numbers; else numbers;; (*-----------------------------*) let (=+) s re = let result = ref [] in let offset = ref 0 in while ((String.length s) > !offset) do try offset := 1 + (Str.search_forward (Str.regexp re) s !offset); result := !result @ [Str.matched_string s] @ []; with Not_found -> ignore (offset := String.length s) done; result;; let (=-) s re = let result = ref [] in let offset = ref 0 in while ((String.length s) > !offset) do try ignore (Str.search_forward (Str.regexp re) s !offset); offset := Str.match_end (); result := !result @ [Str.matched_string s] @ []; with Not_found -> ignore (offset := String.length s) done; result;; let digits = "123456789";; let yeslap = digits =+ "[1234567890][1234567890][1234567890]";; let nonlap = digits =- "[1234567890][1234567890][1234567890]";; print_string "Non-overlapping: "; List.iter (fun v -> print_string (v ^ " ")) !nonlap; print_string "\n";; (* Non-overlapping: 123 456 789 *) print_string "Overlapping: "; List.iter (fun v -> print_string (v ^ " ")) !yeslap; print_string "\n";; (* Overlapping: 123 234 345 456 567 678 789 *) (*-----------------------------*) let index = ref 0;; let string = "And little lambs eat ivy";; try index := Str.search_forward (Str.regexp "l[^s]*s") string 0; with Not_found -> ();; print_string ("(" ^ (String.sub string 0 !index) ^ ") "); print_string ("(" ^ (Str.matched_string string) ^ ") "); print_string ("(" ^ (Str.string_after string 16) ^ ")\n");; (* (And ) (little lambs) ( eat ivy) *) |
#load "str.cma";; (* The Str module doesn't modify strings in place; you always get a copy when you perform a substitution. *) let dst = Str.global_replace (Str.regexp "this") "that" src (* Strip to basename. *) let progname = Str.replace_first (Str.regexp "^.*/") "" Sys.argv.(0) (* Make All Words Title-Cased. *) let capword = Str.global_substitute (Str.regexp "\\b.") (fun s -> String.uppercase (Str.matched_string s)) words (* /usr/man/man3/foo.1 changes to /usr/man/cat3/foo.1 *) let catpage = Str.replace_first (Str.regexp "man\\([0-9]\\)") "cat\\1" manpage (* Copy and substitute on all strings in a list. *) let bindirs = ["/usr/bin"; "/bin"; "/usr/local/bin"] let libdirs = List.map (fun s -> Str.replace_first (Str.regexp "bin") "lib" s) bindirs (* ["/usr/lib"; "/lib"; "/usr/local/lib"] *) |
(* Str can do a simple character range match, but it isn't very practical for matching alphabetic characters in general. *) #load "str.cma";; let () = if Str.string_match (Str.regexp "^[A-Za-z]+$") var 0 then print_endline "var is purely alphabetic" (* With Pcre, you can use UTF8 support and match characters with the letter property. *) #directory "+pcre";; #load "pcre.cma";; let () = if Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`UTF8] "^\\pL+$") var then print_endline "var is purely alphabetic" |
#load "str.cma";; (* Str's regexps lack a whitespace-matching pattern. Here is a substitute. *) let whitespace_chars = String.concat "" (List.map (String.make 1) [ Char.chr 9; (* HT *) Char.chr 10; (* LF *) Char.chr 11; (* VT *) Char.chr 12; (* FF *) Char.chr 13; (* CR *) Char.chr 32; (* space *) ]) let space = "[" ^ whitespace_chars ^ "]" let non_space = "[^" ^ whitespace_chars ^ "]" (* as many non-whitespace characters as possible *) let regexp = Str.regexp (non_space ^ "+") (* as many letters, apostrophes, and hyphens *) let regexp = Str.regexp "[A-Za-z'-]+" (* usually best *) let regexp = Str.regexp "\\b\\([A-Za-z]+\\)\\b" (* fails at ends or w/ punctuation *) let regexp = Str.regexp (space ^ "\\([A-Za-z]+\\)" ^ space) |
#!/usr/bin/ocaml (* resname - change all "foo.bar.com" style names in the input stream into "foo.bar.com [204.148.40.9]" (or whatever) instead *) #directory "+pcre";; #load "pcre.cma";; #load "unix.cma";; let regexp = Pcre.regexp ~flags:[`EXTENDED] " ( # capture the hostname in substring 1 (?: # these parens for grouping only (?! [-_] ) # lookahead for neither underscore nor dash [\\w-] + # hostname component \\. # and the domain dot ) + # now repeat that whole thing a bunch of times [A-Za-z] # next must be a letter [\\w-] + # now trailing domain part ) # end of substring 1 capture " let process line = print_endline (Pcre.substitute_substrings ~rex:regexp ~subst:(fun subs -> let name = Pcre.get_substring subs 1 in let addr = try Unix.string_of_inet_addr (Unix.gethostbyname name).Unix.h_addr_list.(0) with Not_found -> "???" in name ^ " [" ^ addr ^ "]") line) let () = try while true do let line = read_line () in process line done with End_of_file -> () (*-----------------------------*) let vars = Hashtbl.create 0 let () = Hashtbl.replace vars "name" "Bob"; Hashtbl.replace vars "flavor" "rhubarb" let () = print_endline (Pcre.substitute_substrings ~rex:(Pcre.regexp ~flags:[`EXTENDED] " \\# # a pound sign (\\w+) # the variable name \\# # another pound sign ") ~subst:(fun subs -> Hashtbl.find vars (Pcre.get_substring subs 1)) "Hello, #name#, would you like some #flavor# pie?") |
#load "str.cma";; let want = 3 let count = ref 0 let pond = "One fish two fish red fish blue fish" let regexp = Str.regexp_case_fold "\\([a-z]+\\)[ ]+fish\\b" exception Found of string let () = let start = ref 0 in try while true do ignore (Str.search_forward regexp pond !start); start := !start + String.length (Str.matched_string pond); incr count; if !count = want then raise (Found (Str.matched_group 1 pond)) done with | Found color -> Printf.printf "The third fish is a %s one.\n" color | Not_found -> Printf.printf "Only found %d fish!\n" !count (* The third fish is a red one. *) (*-----------------------------*) let colors = let start = ref 0 in let fish = ref [] in begin try while true do ignore (Str.search_forward regexp pond !start); start := !start + (String.length (Str.matched_string pond)); fish := (Str.matched_group 1 pond) :: !fish done; with Not_found -> () end; Array.of_list (List.rev !fish) let () = Printf.printf "The third fish in the pond is %s.\n" colors.(2) (* The third fish in the pond is red. *) (*-----------------------------*) let evens = let colors' = ref [] in Array.iteri (fun i color -> if i mod 2 = 1 then colors' := color :: !colors') colors; List.rev !colors' let () = Printf.printf "Even numbered fish are %s.\n" (String.concat " " evens) (* Even numbered fish are two blue. *) (*-----------------------------*) let () = let count = ref 0 in print_endline (Str.global_substitute (Str.regexp_case_fold "\\b\\([a-z]+\\)\\([ ]+fish\\b\\)") (fun s -> incr count; if !count = 4 then "sushi" ^ Str.matched_group 2 s else Str.matched_group 1 s ^ Str.matched_group 2 s) pond) (* One fish two fish red fish sushi fish *) (*-----------------------------*) let pond = "One fish two fish red fish blue fish swim here." let regexp = Str.regexp_case_fold "\\b\\([a-z]+\\)[ ]+fish\\b" let colors = let rec loop start acc = try ignore (Str.search_forward regexp pond start); loop (start + String.length (Str.matched_string pond)) (Str.matched_group 1 pond :: acc) with Not_found -> acc in loop 0 [] let color = List.hd colors let () = Printf.printf "Last fish is %s.\n" color (* Last fish is blue. *) |
#!/usr/bin/ocaml (* killtags - very bad html tag killer *) #load "str.cma";; let regexp = Str.regexp "<[^>]*>" let () = List.iter (fun filename -> let lines = ref [] in let in_channel = open_in filename in try begin try while true do lines := input_line in_channel :: !lines done with End_of_file -> () end; let contents = String.concat "\n" (List.rev !lines) in print_endline (String.concat "" (List.map (function | Str.Text s -> s | _ -> "") (Str.full_split regexp contents))); close_in in_channel with e -> close_in in_channel; raise e) (List.tl (Array.to_list Sys.argv)) (*-----------------------------*) #!/usr/bin/ocaml (* headerfy - change certain chapter headers to html *) #load "str.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let paragraph_stream_of_channel channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some "", [] -> Stream.junk lines; next para_lines i | Some "", _ | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) let regexp = Str.regexp "^Chapter[\r\n\t ]+[0-9]+[\r\n\t ]*:[^\r\n]*" let headerfy chunk = String.concat "" (List.map (function | Str.Text s -> s | Str.Delim s -> "<H1>" ^ s ^ "</H1>") (Str.full_split regexp chunk)) let () = List.iter (fun filename -> let in_channel = open_in filename in try Stream.iter (fun para -> print_endline (headerfy para); print_newline ()) (paragraph_stream_of_channel in_channel); close_in in_channel with e -> close_in in_channel; raise e) (List.tl (Array.to_list Sys.argv)) |
#load "str.cma";; let chunks = let lines = ref [] in begin try while true do lines := input_line stdin :: !lines done with End_of_file -> () end; let contents = String.concat "\n" (List.rev !lines) in Str.full_split (Str.regexp "^\\.\\(Ch\\|Se\\|Ss\\)$") contents let () = Printf.printf "I read %d chunks.\n" (List.length chunks) |
#load "str.cma";; (* Creates a stream that produces ranges of items from another stream. Production of items starts when when (start_test count item) returns true and stops when (finish_test count item) returns true. Multiple ranges will be produced if start_test returns true again. The count starts at 1. Ranges are inclusive; the item that causes finish_test to return true will be produced. *) let stream_range start_test finish_test stream = let active = ref false in let count = ref 1 in let rec next i = match Stream.peek stream with | None -> None | Some item -> if not !active then begin if start_test !count item then (active := true; next i) else (Stream.junk stream; incr count; next i) end else begin if finish_test !count item then active := false; Stream.junk stream; incr count; Some item end in Stream.from next (* Creates a stream that produces items between a pair of indices. If start = 2 and finish = 4, items 2, 3, and 4 will be produced. The first item is number 1. *) let stream_range_numbers start finish stream = stream_range (fun count _ -> count = start) (fun count _ -> count = finish) stream (* Creates a stream that produces strings between a pair of regexps. The regexp will be tested using Str.string_match. *) let stream_range_patterns start finish stream = stream_range (fun _ line -> Str.string_match start line 0) (fun _ line -> Str.string_match finish line 0) stream (* Produce a stream of lines from an input channel. *) let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) (* Print lines 15 through 17 inclusive. *) let () = Stream.iter print_endline (stream_range_numbers 15 17 (line_stream_of_channel (open_in datafile))) (* Print out all <XMP> .. </XMP> displays from HTML doc. *) let () = Stream.iter print_endline (stream_range_patterns (Str.regexp ".*<XMP>") (Str.regexp ".*</XMP>") (line_stream_of_channel stdin)) (*-----------------------------*) let in_header = ref true let in_body = ref false let () = Stream.iter (fun line -> if !in_header && line = "" then (in_header := false; in_body := true) else begin (* do something with line *) end) (line_stream_of_channel stdin) (*-----------------------------*) module StringSet = Set.Make(String) let seen = ref StringSet.empty let email_regexp = Str.regexp "\\([^<>(),; \t]+@[^<>(),; \t]+\\)" let () = Stream.iter (fun line -> List.iter (function | Str.Delim email -> if not (StringSet.mem email !seen) then begin seen := StringSet.add email !seen; print_endline email; end | _ -> ()) (Str.full_split email_regexp line)) (stream_range_patterns (Str.regexp "^From:?[ \t]") (Str.regexp "^$") (line_stream_of_channel stdin)) |
#load "str.cma";; let regexp_string_of_glob s = let i, buffer = ref (-1), Buffer.create (String.length s + 8) in let read () = incr i; if !i < String.length s then Some s.[!i] else None in let write = Buffer.add_string buffer in let rec parse_glob () = match read () with | Some '*' -> write ".*"; parse_glob () | Some '?' -> write "."; parse_glob () | Some '[' -> parse_bracket "" | Some c -> write (Str.quote (String.make 1 c)); parse_glob () | None -> () and parse_bracket text = match read () with | Some '!' when text = "" -> parse_bracket "^" | Some ']' -> write ("[" ^ text ^ "]"); parse_glob () | Some c -> parse_bracket (text ^ (String.make 1 c)) | None -> write (Str.quote ("[" ^ text)) in write "^"; parse_glob (); write "$"; Buffer.contents buffer let regexp_of_glob s = Str.regexp (regexp_string_of_glob s) let regexp_of_glob_case_fold s = Str.regexp_case_fold (regexp_string_of_glob s) |
#load "str.cma";; let popstates = ["CO"; "ON"; "MI"; "WI"; "MN"] (* Naive version: Compile a regexp each time it is needed. *) let popgrep1 () = try begin while true do let line = input_line stdin in try List.iter (fun state -> if (Str.string_match (Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b")) line 0) then (print_endline line; raise Exit)) popstates with Exit -> () done end with End_of_file -> () (* First optimization: Compile the regexps in advance. *) let popgrep2 () = let popstate_regexps = List.map (fun state -> Str.regexp (".*\\b" ^ (Str.quote state) ^ "\\b")) popstates in try begin while true do let line = input_line stdin in try List.iter (fun regexp -> if (Str.string_match regexp line 0) then (print_endline line; raise Exit)) popstate_regexps with Exit -> () done end with End_of_file -> () (* Second optimization: Build a single regexp for all states. *) let popgrep3 () = let popstates_regexp = Str.regexp (".*\\b\\(" ^ (String.concat "\\|" (List.map Str.quote popstates)) ^ "\\)\\b") in try begin while true do let line = input_line stdin in if Str.string_match popstates_regexp line 0 then print_endline line done end with End_of_file -> () (* Speed tests with a 15,000 line input file: *) let () = popgrep1 () (* time: 13.670s *) let () = popgrep2 () (* time: 0.264s *) let () = popgrep3 () (* time: 0.123s *) |
#load "str.cma";; let () = while true do print_string "Pattern? "; flush stdout; let pattern = input_line stdin in try ignore (Str.regexp pattern) with Failure message -> Printf.printf "INVALID PATTERN: %s\n" message done (*-----------------------------*) let is_valid_pattern pattern = try ignore (Str.regexp pattern); true with Failure _ -> false (*-----------------------------*) #!/usr/bin/ocaml (* paragrep - trivial paragraph grepper *) #load "str.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let paragraph_stream_of_channel channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some "", [] -> Stream.junk lines; next para_lines i | Some "", _ | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) let paragrep pat files = let regexp = begin try Str.regexp pat with Failure msg -> Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg; exit 1 end in let count = ref 0 in List.iter (fun file -> let channel = if file = "-" then stdin else open_in file in try Stream.iter (fun para -> incr count; try ignore (Str.search_forward regexp para 0); Printf.printf "%s %d: %s\n\n" file !count para with Not_found -> ()) (paragraph_stream_of_channel channel); close_in channel with e -> close_in channel; raise e) files let () = match List.tl (Array.to_list Sys.argv) with | pat :: [] -> paragrep pat ["-"] | pat :: files -> paragrep pat files | [] -> Printf.eprintf "usage: %s pat [files]\n" Sys.argv.(0) (*-----------------------------*) let safe_pat = Str.quote pat |
(* OCaml does not provide a way to change the locale, and PCRE does not appear to be sensitive to the default locale. Regardless, Str does not support locales, and PCRE only matches ASCII characters for \w and friends. This example instead demonstrates the use of PCRE's UTF-8 support to match words, and it does not use locales. *) #directory "+pcre";; #load "pcre.cma";; (* encoded as UTF-8 *) let name = "andreas k\xc3\xb6nig" (* the original regexp which is not Unicode-aware *) let ascii_regexp = Pcre.regexp "\\b(\\w+)\\b" (* a revised regexp which tests for Unicode letters and numbers *) let utf8_regexp = Pcre.regexp ~flags:[`UTF8] "([\\pL\\pN]+)" let () = List.iter (fun (enc, regexp) -> Printf.printf "%s names: %s\n" enc (String.concat " " (List.map String.capitalize (List.flatten (Array.to_list (Array.map Array.to_list (Pcre.extract_all ~full_match:false ~rex:regexp name))))))) ["ASCII", ascii_regexp; "UTF-8", utf8_regexp] (* ASCII names: Andreas K Nig UTF-8 names: Andreas König *) |
(* Calculates the Levenshtein, or edit distance, between two strings. *) let levenshtein s t = let n = String.length s in let m = String.length t in match (m, n) with | (m, 0) -> m | (0, n) -> n | (m, n) -> let d = Array.init (m + 1) (fun x -> x) in let x = ref 0 in for i = 0 to n - 1 do let e = ref (i + 1) in for j = 0 to m - 1 do let cost = if s.[i] = t.[j] then 0 else 1 in x := min (d.(j + 1) + 1) (* insertion *) (min (!e + 1) (* deletion *) (d.(j) + cost)); (* substitution *) d.(j) <- !e; e := !x done; d.(m) <- !x done; !x (* Determines if two strings are an approximate match. *) let amatch ?(percentage=20) s t = levenshtein s t * 100 / String.length s <= percentage let () = let dict = open_in "/usr/dict/words" in try while true do let word = input_line dict in if amatch "balast" word then print_endline word done with End_of_file -> close_in dict (* ballast blast *) |
#directory "+pcre";; #load "pcre.cma";; let s = "12 345 hello 6 7world89 10" let rex = Pcre.regexp "(\\d+)" let () = let subs = ref (Pcre.exec ~rex s) in try while true do Printf.printf "Found %s\n" (Pcre.get_substring !subs 1); subs := Pcre.next_match ~rex !subs done with Not_found -> () (*-----------------------------*) let () = let n = " 49 here" in let n = Pcre.replace ~pat:"\\G " ~templ:"0" n in print_endline n (* 00049 here *) (*-----------------------------*) let s = "3,4,5,9,120" let rex = Pcre.regexp "\\G,?(\\d+)" let () = let subs = ref (Pcre.exec ~rex s) in try while true do Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1); subs := Pcre.next_match ~rex !subs done with Not_found -> () (*-----------------------------*) let s = "The year 1752 lost 10 days on the 3rd of September" let rex = Pcre.regexp "(\\d+)" let subs = ref (Pcre.exec ~rex s) let () = try while true do Printf.printf "Found number %s\n" (Pcre.get_substring !subs 1); subs := Pcre.next_match ~rex !subs done with Not_found -> () let () = let rex = Pcre.regexp "\\G(\\S+)" in subs := Pcre.next_match ~rex !subs; Printf.printf "Found %s after the last number.\n" (Pcre.get_substring !subs 1) (* Found number 1752 Found number 10 Found number 3 Found rd after the last number. *) (*-----------------------------*) let () = match Pcre.get_substring_ofs !subs 1 with | (start, finish) -> Printf.printf "The position in 's' is %d..%d\n" start finish (* The position in 's' is 35..37 *) |
let s = "Even <TT>vi</TT> can edit <TT>troff</TT> effectively." (* The Str library does not support non-greedy matches. In many cases, you can turn a non-greedy match into a greedy one, however: *) #load "str.cma";; let () = print_endline (Str.global_replace (Str.regexp "<.*>") "" s) (* Even effectively. *) let () = print_endline (Str.global_replace (Str.regexp "<[^>]*>") "" s) (* Even vi can edit troff effectively. *) (* If you need non-greedy matches, you'll want to use PCRE instead: *) #directory "+pcre";; #load "pcre.cma";; let () = print_endline (Pcre.replace ~pat:"<.*?>" ~templ:"" s) (* Even vi can edit troff effectively. *) (* Non-greedy matches don't always work the way you expect: *) let s = "<b><i>this</i> and <i>that</i> are important</b> Oh, <b><i>me too!</i></b>" let rex = Pcre.regexp "<b><i>(.*?)</i></b>" let () = print_endline (Pcre.extract ~rex s).(1) (* this</i> and <i>that</i> are important</b> Oh, <b><i>me too! *) (* One solution is to use a non-grouping negative lookahead assertion: *) let rex = Pcre.regexp "<b><i>((?:(?!</b>|</i>).)*)</i></b>" let () = print_endline (Pcre.extract ~rex s).(1) (* me too! *) (* If performance is important, here is a faster technique: *) let rex = Pcre.regexp ~flags:[`DOTALL; `EXTENDED] " <b><i> [^<]* # stuff not possibly bad, and not possibly the end. (?: # at this point, we can have '<' if not part of something bad (?! </?[ib]> ) # what we can't have < # okay, so match the '<' [^<]* # and continue with more safe stuff ) * </i></b> " let () = print_endline (Pcre.extract ~rex s).(0) (* <b><i>me too!</i></b> *) |
#directory "+pcre";; #load "pcre.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let paragraph_stream_of_channel channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some "", [] -> Stream.junk lines; next para_lines i | Some "", _ | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) let find_dup_words files = let rex = Pcre.regexp ~flags:[`CASELESS; `EXTENDED] " \\b # start at a word boundary (begin letters) (\\S+) # find chunk of non-whitespace \\b # until another word boundary (end letters) ( \\s+ # separated by some whitespace \\1 # and that very same chunk again \\b # until another word boundary ) + # one or more sets of those " in let count = ref 0 in List.iter (fun file -> let channel = if file = "-" then stdin else open_in file in try Stream.iter (fun para -> incr count; try let subs = ref (Pcre.exec ~rex para) in while true do Printf.printf "dup word '%s' at paragraph %d.\n" (Pcre.get_substring !subs 1) !count; flush stdout; subs := Pcre.next_match ~rex !subs; done with Not_found -> ()) (paragraph_stream_of_channel channel); close_in channel with e -> close_in channel; raise e) files let () = match List.tl (Array.to_list Sys.argv) with | [] -> find_dup_words ["-"] | files -> find_dup_words files (*-----------------------------*) (* This is a test test of the duplicate word finder. *) (* dup word 'test' at paragraph 1. *) (*-----------------------------*) let a = "nobody" let b = "bodysnatcher" let () = try let subs = Pcre.exec ~pat:"^(\\w+)(\\w+) \\2(\\w+)$" (a ^ " " ^ b) in Printf.printf "%s overlaps in %s-%s-%s\n" (Pcre.get_substring subs 2) (Pcre.get_substring subs 1) (Pcre.get_substring subs 2) (Pcre.get_substring subs 3) with Not_found -> () (* body overlaps in no-body-snatcher *) (*-----------------------------*) #!/usr/bin/ocaml (* prime_pattern -- find prime factors of argument using pattern matching *) #directory "+pcre";; #load "pcre.cma";; let arg = try int_of_string Sys.argv.(1) with _ -> 0 let n = ref (String.make arg 'o') let rex = Pcre.regexp "^(oo+?)\\1+$" let templ = "o" let () = try while true do let pat = Pcre.get_substring (Pcre.exec ~rex !n) 1 in Printf.printf "%d " (String.length pat); n := Pcre.replace ~pat ~templ !n done with Not_found -> Printf.printf "%d\n" (String.length !n) (*-----------------------------*) exception Found of (int * int * int) let () = try match Pcre.extract ~full_match:false ~pat:"^(o*)\\1{11}(o*)\\2{14}(o*)\\3{15}$" (String.make 281 'o') with | [| x; y; z |] -> raise (Found (String.length x, String.length y, String.length z)) | _ -> raise Not_found with | Found (x, y, z) -> Printf.printf "One solution is: x=%d; y=%d; z=%d.\n" x y z | Not_found -> Printf.printf "No solution.\n" (* One solution is: x=17; y=3; z=2. *) (*-----------------------------*) ~pat:"^(o+)\\1{11}(o+)\\2{14}(o+)\\3{15}$" (* One solution is: x=17; y=3; z=2. *) ~pat:"^(o*?)\\1{11}(o*)\\2{14}(o*)\\3{15}$" (* One solution is: x=0; y=7; z=11. *) ~pat:"^(o+?)\\1{11}(o*)\\2{14}(o*)\\3{15}$" (* One solution is: x=1; y=3; z=14. *) |
#directory "+pcre";; #load "pcre.cma";; let pat = input_line config_channel let () = if Pcre.pmatch ~pat data then (* ... *) () (*-----------------------------*) (* alpha OR beta *) let regexp = Pcre.regexp "alpha|beta" (* alpha AND beta *) let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?=.*alpha)(?=.*beta)" (* alpha AND beta, no overlap *) let regexp = Pcre.regexp ~flags:[`DOTALL] "alpha.*beta|beta.*alpha" (* NOT pat *) let regexp = Pcre.regexp ~flags:[`DOTALL] "^(?:(?!pat).)*$" (* NOT bad BUT good *) let regexp = Pcre.regexp ~flags:[`DOTALL] "(?=(?:(?!bad).)*$)good" (*-----------------------------*) let () = if not (Pcre.pmatch ~rex:regexp text) then something () (*-----------------------------*) let () = if (Pcre.pmatch ~rex:rexexp1 text) && (Pcre.pmatch ~rex:rexexp2 text) then something () (*-----------------------------*) let () = if (Pcre.pmatch ~rex:rexexp1 text) || (Pcre.pmatch ~rex:rexexp2 text) then something () (*-----------------------------*) #!/usr/bin/ocaml (* minigrep - trivial grep *) #directory "+pcre";; #load "pcre.cma";; let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) let minigrep pat files = let rex = try Pcre.regexp pat with Pcre.BadPattern (msg, _) -> Printf.eprintf "%s: Bad pattern %s: %s\n" Sys.argv.(0) pat msg; exit 1 in let process file = let channel = if file = "-" then stdin else open_in file in try Stream.iter (fun line -> if Pcre.pmatch ~rex line then print_endline line) (line_stream_of_channel channel); close_in channel with e -> close_in channel; raise e in List.iter process files let () = match List.tl (Array.to_list Sys.argv) with | pat :: [] -> minigrep pat ["-"] | pat :: files -> minigrep pat files | [] -> Printf.eprintf "usage: %s pat [files]\n" Sys.argv.(0) (*-----------------------------*) let string = "labelled" let () = Printf.printf "%b\n" (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`DOTALL] "^(?=.*bell)(?=.*lab)") string) let () = Printf.printf "%b\n" (Pcre.pmatch ~pat:"bell" string && Pcre.pmatch ~pat:"lab" string) let () = if (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] " ^ # start of string (?= # zero-width lookahead .* # any amount of intervening stuff bell # the desired bell string ) # rewind, since we were only looking (?= # and do the same thing .* # any amount of intervening stuff lab # and the lab part )") string) then print_endline "Looks like Bell Labs might be in Murray Hill!" let () = Printf.printf "%b\n" (Pcre.pmatch ~pat:"(?:^.*bell.*lab)|(?:^.*lab.*bell)" string) let brand = "labelled" let () = if (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] " (?: # non-capturing grouper ^ .*? # any amount of stuff at the front bell # look for a bell .*? # followed by any amount of anything lab # look for a lab ) # end grouper | # otherwise, try the other direction (?: # non-capturing grouper ^ .*? # any amount of stuff at the front lab # look for a lab .*? # followed by any amount of anything bell # followed by a bell ) # end grouper ") brand) then print_endline "Our brand has bell and lab separate." let map = "a map of the world" let () = Printf.printf "%b\n" (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`DOTALL] "^(?:(?!waldo).)*$") map) let () = if (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] " ^ # start of string (?: # non-capturing grouper (?! # look ahead negation waldo # is he ahead of us now? ) # is so, the negation failed . # any character (cuzza /s) ) * # repeat that grouping 0 or more $ # through the end of the string ") map) then print_endline "There's no waldo here!" (*-----------------------------*) % w | minigrep '^(?!.*ttyp).*tchrist' (*-----------------------------*) Pcre.regexp ~flags:[`EXTENDED] " ^ # anchored to the start (?! # zero-width look-ahead assertion .* # any amount of anything (faster than .*?) ttyp # the string you don't want to find ) # end look-ahead negation; rewind to start .* # any amount of anything (faster than .*?) tchrist # now try to find Tom " (*-----------------------------*) % w | grep tchrist | grep -v ttyp (*-----------------------------*) % grep -i 'pattern' files % minigrep '(?i)pattern' files |
#load "str.cma";; (* Regexp text for an EUC-JP character *) let eucjp = (String.concat "\\|" (* EUC-JP encoding subcomponents: *) [ (* ASCII/JIS-Roman (one-byte/character) *) "[\x00-\x7F]"; (* half-width katakana (two bytes/char) *) "\x8E[\xA0-\xDF]"; (* JIS X 0212-1990 (three bytes/char) *) "\x8F[\xA1-\xFE][\xA1-\xFE]"; (* JIS X 0208:1997 (two bytes/char) *) "[\xA1-\xFE][\xA1-\xFE]"; ]) (* Match any number of EUC-JP characters preceding Tokyo *) let regexp = Str.regexp ("\\(\\(" ^ eucjp ^ "\\)*\\)\\(\xC5\xEC\xB5\xFE\\)") (* Search from the beginning for a match *) let () = if Str.string_match regexp string 0 then print_endline "Found Tokyo" (* Replace Tokyo with Osaka *) let () = let buffer = Buffer.create (String.length string) in let start = ref 0 in while Str.string_match regexp string !start do Buffer.add_string buffer (Str.matched_group 1 string); Buffer.add_string buffer osaka; (* Assuming osaka is defined *) start := Str.match_end (); done; if !start < String.length string then Buffer.add_substring buffer string !start (String.length string - !start); print_endline (Buffer.contents buffer) (* Split a multi-byte string into characters *) let () = (* One character per list element *) let chars = Array.map (function | Str.Delim c -> c | Str.Text c -> failwith ("invalid char: " ^ c)) (Array.of_list (Str.full_split (Str.regexp eucjp) string)) in let length = Array.length chars in for i = 0 to length - 1 do if String.length chars.(i) = 1 then begin (* Do something interesting with this one-byte character *) end else begin (* Do something interesting with this multi-byte character *) end done; (* Glue list back together *) let line = String.concat "" (Array.to_list chars) in print_endline line (* Determine if an entire string is valid EUC-JP *) let is_eucjp s = Str.string_match (Str.regexp ("\\(" ^ eucjp ^ "\\)*$")) s 0 (* Assuming a similar string has been defined for Shift-JIS *) let is_sjis s = Str.string_match (Str.regexp ("\\(" ^ sjis ^ "\\)*$")) s 0 (* Convert from EUC-JP to Unicode, assuming a Hashtbl named euc2uni is defined with the appropriate character mappings *) let () = let chars = Array.map (function | Str.Delim c -> c | Str.Text c -> failwith ("invalid char: " ^ c)) (Array.of_list (Str.full_split (Str.regexp eucjp) string)) in let length = Array.length chars in for i = 0 to length - 1 do if Hashtbl.mem euc2uni chars.(i) then begin chars.(i) <- (Hashtbl.find euc2uni chars.(i)) end else begin (* deal with unknown EUC->Unicode mapping here *) end done; let line = String.concat "" (Array.to_list chars) in print_endline line |
#load "str.cma";; (* Not foolproof, but works in most common cases. *) let regexp = Str.regexp_case_fold "\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z][A-Z][A-Z]?[A-Z]?\\b" let () = try while true do print_string "Email: "; flush stdout; let line = input_line stdin in try let start = ref 0 in while true do start := Str.search_forward regexp line !start; let string = Str.matched_string line in start := !start + String.length string; print_string "Found: "; print_endline string; done with Not_found -> () done with End_of_file -> () |
#load "str.cma";; let () = try while true do print_string "Action: "; flush stdout; let answer = input_line stdin in let regexp = Str.regexp_string_case_fold answer in if Str.string_match regexp "SEND" 0 then print_endline "Action is send" else if Str.string_match regexp "STOP" 0 then print_endline "Action is stop" else if Str.string_match regexp "ABORT" 0 then print_endline "Action is abort" else if Str.string_match regexp "LIST" 0 then print_endline "Action is list" else if Str.string_match regexp "EDIT" 0 then print_endline "Action is edit" done with End_of_file -> () (*-----------------------------*) (* assumes that invoke_editor, deliver_message, *) (* file and pager are defined somewhere else. *) let actions = [ "edit", invoke_editor; "send", deliver_message; "list", (fun () -> ignore (Sys.command (pager ^ " " ^ file))); "abort", (fun () -> print_endline "See ya!"; exit 0); ] let errors = ref 0 let () = try while true do print_string "Action: "; flush stdout; let answer = input_line stdin in (* trim leading white space *) let answer = Str.replace_first (Str.regexp "^[ \t]+") "" answer in (* trim trailing white space *) let answer = Str.replace_first (Str.regexp "[ \t]+$") "" answer in let regexp = Str.regexp_string_case_fold answer in let found = ref false in List.iter (fun (action, handler) -> if Str.string_match regexp action 0 then (found := true; handler ())) actions; if not !found then (incr errors; print_endline ("Unknown command: " ^ answer)) done with End_of_file -> () |
#!/usr/bin/ocaml (* urlify - wrap HTML links around URL-like constructs *) #directory "+pcre";; #load "pcre.cma";; let urls = "(http|telnet|gopher|file|wais|ftp)" let ltrs = "\\w" let gunk = "/#~:.?+=&%@!\\-" let punc = ".:?\\-" let any = ltrs ^ gunk ^ punc let rex = Pcre.regexp ~flags:[`CASELESS; `EXTENDED] (Printf.sprintf " \\b # start at word boundary ( # begin $1 { %s : # need resource and a colon [%s] +? # followed by one or more # of any valid character, but # be conservative and take only # what you need to.... ) # end $1 } (?= # look-ahead non-consumptive assertion [%s]* # either 0 or more punctuation [^%s] # followed by a non-url char | # or else $ # then end of the string ) " urls any punc any) let templ = "<A HREF=\"$1\">$1</A>" let () = try while true do let line = input_line stdin in print_endline (Pcre.replace ~rex ~templ line) done with End_of_file -> () |
% tcgrep -ril '^From: .*kate' ~/mail (*-----------------------------*) #!/usr/bin/ocaml (* tcgrep: rewrite of tom christiansen's rewrite of grep *) #load "unix.cma";; (* http://ocaml.info/home/ocaml_sources.html#pcre-ocaml *) #directory "+pcre";; #load "pcre.cma";; (* http://alain.frisch.fr/soft.html#Getopt *) #directory "+getopt";; #load "getopt.cma";; (* Initialize globals. *) let me = Pcre.replace ~pat:".*/" Sys.argv.(0) let matches = ref 0 let errors = ref 0 let grand_total = ref 0 let mult = ref false let compress = [".z", "zcat"; ".gz", "zcat"; ".Z", "zcat"] (* Prints usage and exits. *) let usage () = Printf.eprintf "usage: %s [flags] [files] Standard grep options: i case insensitive n number lines c give count of lines matching C ditto, but >1 match per line possible w word boundaries only s silent mode x exact matches only v invert search sense (lines that DON'T match) h hide filenames e expression (for exprs beginning with -) f file with expressions l list filenames matching Specials: 1 1 match per file H highlight matches u underline matches r recursive on directories or dot if none t process directories in 'ls -t' order p paragraph mode (default: line mode) P ditto, but specify separator, e.g. -P '%%' a all files, not just plain text files (not implemented) q quiet about failed file and dir opens T trace files as opened May use a TCGREP environment variable to set default options. " me; exit 255 (* Produces a stream of lines from an input channel. *) let line_stream_of_channel channel = Stream.from (fun _ -> try Some (input_line channel) with End_of_file -> None) (* Produces a stream of chunks from an input channel given a delimiter. *) let delimited_stream_of_channel delim channel = let lines = line_stream_of_channel channel in let rec next para_lines i = match Stream.peek lines, para_lines with | None, [] -> None | Some delim', [] when delim' = delim -> Stream.junk lines; next para_lines i | Some delim', _ when delim' = delim -> Some (String.concat "\n" (List.rev para_lines)) | None, _ -> Some (String.concat "\n" (List.rev para_lines)) | Some line, _ -> Stream.junk lines; next (line :: para_lines) i in Stream.from (next []) (* An empty delimiter corresponds to an empty line, so we can create a paragraph stream in terms of the previous function. *) let paragraph_stream_of_channel = delimited_stream_of_channel "" (* By default, the stream builder will produce lines. This can be changed by the -p and -P options. *) let stream_of_channel = ref line_stream_of_channel (* Type for command-line options and their values. *) type opt = OBool of bool | OStr of string (* Set an option. *) let opt_set opt c = Hashtbl.replace opt c (OBool true) (* Test an option. *) let opt_test opt c = try match Hashtbl.find opt c with | OBool b -> b | OStr "" -> false | OStr _ -> true with Not_found -> false (* Convert an option to a string. *) let opt_str opt c = try match Hashtbl.find opt c with | OBool b -> string_of_bool b | OStr s -> s with Not_found -> "" (* Gets terminal escape characters. *) let tput cap = let ch = Unix.open_process_in ("tput " ^ cap) in try let result = input_line ch in ignore (Unix.close_process_in ch); result with | End_of_file -> ignore (Unix.close_process_in ch); "" | e -> ignore (Unix.close_process_in ch); raise e (* Splits a filename into its base and extension. *) let splitext name = try let base = Filename.chop_extension name in let i = String.length base in let ext = String.sub name i (String.length name - i) in base, ext with Invalid_argument _ -> name, "" (* Parses command-line arguments. *) let parse_args () = let opt = Hashtbl.create 0 in let args = ref [] in let optstring = "incCwsxvhe:f:l1HurtpP:aqT" in let optstream = Stream.of_string optstring in (* Prepare options for Getopt. *) let opts = let str_setter c = (c, Getopt.nolong, None, Some (fun s -> Hashtbl.replace opt c (OStr s))) in let int_setter c = (c, Getopt.nolong, Some (fun () -> Hashtbl.replace opt c (OBool true)), None) in let rec loop acc = match Stream.peek optstream with | Some c -> (Stream.junk optstream; match Stream.peek optstream with | Some ':' -> Stream.junk optstream; loop (str_setter c :: acc) | _ -> loop (int_setter c :: acc)) | None -> List.rev acc in loop [] in (* Handle TCGREP environment variable. *) let cmdline = ref (List.tl (Array.to_list Sys.argv)) in Array.iter (fun env -> if (String.length env > 7 && String.sub env 0 7 = "TCGREP=") then begin let s = String.sub env 7 (String.length env - 7) in let s = if s.[0] <> '-' then "-" ^ s else s in cmdline := s :: !cmdline end) (Unix.environment ()); let cmdline = Array.of_list !cmdline in (* Parse command-line options using Getopt. *) begin try Getopt.parse opts (fun arg -> args := arg :: !args) cmdline 0 (Array.length cmdline - 1); args := List.rev !args with Getopt.Error e -> prerr_endline e; usage () end; (* Read patterns from file or command line. *) let patterns = if opt_test opt 'f' then begin let in_channel = try open_in (opt_str opt 'f') with e -> Printf.eprintf "%s: can't open %s: %s\n" me (opt_str opt 'f') (Printexc.to_string e); exit 255 in try let acc = ref [] in Stream.iter (fun pat -> acc := pat :: !acc) (line_stream_of_channel in_channel); close_in in_channel; List.rev !acc with e -> close_in in_channel; Printf.eprintf "%s: error reading %s: %s\n" me (opt_str opt 'f') (Printexc.to_string e); exit 255 end else if opt_test opt 'e' then [opt_str opt 'e'] else [match !args with h :: t -> (args := t; h) | [] -> usage ()] in (* Terminal escape characters for highlighting options. *) let highlight = if opt_test opt 'H' then tput "smso" ^ "$1" ^ tput "rmso" else if opt_test opt 'u' then tput "smul" ^ "$1" ^ tput "rmul" else "$1" in (* Regular expression flags to use. *) let flags = ref [] in if opt_test opt 'i' then flags := `CASELESS :: !flags; (* Options for paragraph modes. *) if opt_test opt 'p' then stream_of_channel := paragraph_stream_of_channel; if opt_test opt 'P' then stream_of_channel := delimited_stream_of_channel (opt_str opt 'P'); (* Word boundary option. *) let patterns = if opt_test opt 'w' then List.map (fun pat -> "\\b" ^ pat ^ "\\b") patterns else patterns in (* Exact match option. *) let patterns = if opt_test opt 'x' then List.map (fun pat -> "^" ^ pat ^ "$") patterns else patterns in (* Options that imply other options. *) if opt_test opt 'l' then opt_set opt '1'; if opt_test opt 'u' then opt_set opt 'H'; if opt_test opt 'C' then opt_set opt 'c'; if opt_test opt 'c' then opt_set opt 's'; if opt_test opt 's' && not (opt_test opt 'c') then opt_set opt '1'; (* Compile the regular expression patterns. *) let rexes = List.map (fun pat -> try Pcre.regexp ~flags:!flags ("(" ^ pat ^ ")") with Pcre.BadPattern (msg, _) -> Printf.eprintf "%s: bad pattern %s: %s\n" me pat msg; exit 255) patterns in (* Increments the matches variable by the number of matches (or non-matches) in the given line. *) let count_matches line = if opt_test opt 'v' then fun rex -> (if not (Pcre.pmatch ~rex line) then incr matches) else if opt_test opt 'C' then fun rex -> (matches := !matches + (try Array.length (Pcre.extract_all ~rex line) with Not_found -> 0)) else fun rex -> (if Pcre.pmatch ~rex line then incr matches) in (* Counts matches in a line and returns the line with any necessary highlighting. *) let matcher line = List.iter (count_matches line) rexes; if opt_test opt 'H' then List.fold_left (fun line rex -> Pcre.replace ~rex ~templ:highlight line) line rexes else line in (* List of files or directories to process. *) let files = match !args with | [] -> if opt_test opt 'r' then ["."] else ["-"] | [arg] -> [arg] | args -> (mult := true; args) in (* Overrides for options that affect the multiple-file flag. *) if opt_test opt 'h' then mult := false; if opt_test opt 'r' then mult := true; (* Return the three values to be processed by matchfiles. *) opt, matcher, files (* Used to break out of loops and abort processing of the current file. *) exception NextFile (* Runs given matcher on a list of files using the specified options. *) let rec matchfiles opt matcher files = (* Handles a single directory. *) let matchdir dir = if not (opt_test opt 'r') then begin if opt_test opt 'T' then (Printf.eprintf "%s: \"%s\" is a directory, but no -r given\n" me dir; flush stderr) end else begin let files = try Some (Sys.readdir dir) with e -> if not (opt_test opt 'q') then (Printf.eprintf "%s: can't readdir %s: %s\n" me dir (Printexc.to_string e); flush stderr); incr errors; None in match files with | Some files -> let by_mtime a b = compare (Unix.stat (Filename.concat dir b)).Unix.st_mtime (Unix.stat (Filename.concat dir a)).Unix.st_mtime in if opt_test opt 't' then Array.sort by_mtime files; matchfiles opt matcher (Array.to_list (Array.map (Filename.concat dir) files)) | None -> () end in (* Handles a single file. *) let matchfile file = (* Keep a running total of matches for this file. *) let total = ref 0 in (* Keep track of the current line number. *) let line_num = ref 0 in (* Shadow close_in to properly close process channels for compressed files and avoid closing stdin. *) let process_open = ref false in let close_in channel = if !process_open then ignore (Unix.close_process_in channel) else if channel != stdin then close_in channel in (* Process a line (or paragraph, with -p or -P) of input. *) let matchline line = incr line_num; matches := 0; let line = matcher line in if !matches > 0 then begin total := !total + !matches; grand_total := !grand_total + !matches; if opt_test opt 'l' then (print_endline file; raise NextFile) else if not (opt_test opt 's') then (Printf.printf "%s%s%s%s\n" (if !mult then file ^ ":" else "") (if opt_test opt 'n' then (string_of_int !line_num ^ (if opt_test opt 'p' || opt_test opt 'P' then ":\n" else ":")) else "") line (if opt_test opt 'p' || opt_test opt 'P' then "\n" ^ String.make 20 '-' else ""); flush stdout); if opt_test opt '1' then raise NextFile end in (* Get a channel for the file, starting a decompression process if necessary. If None, the file will be skipped. *) let maybe_channel = if file = "-" then (if Unix.isatty Unix.stdin && not (opt_test opt 'q') then (Printf.eprintf "%s: reading from stdin\n" me; flush stderr); Some stdin) else if not (Sys.file_exists file) then (if not (opt_test opt 'q') then (Printf.eprintf "%s: file \"%s\" does not exist\n" me file; flush stderr); incr errors; None) else if List.mem_assoc (snd (splitext file)) compress then (process_open := true; try Some (Unix.open_process_in (List.assoc (snd (splitext file)) compress ^ " < " ^ file)) with e -> if not (opt_test opt 'q') then (Printf.eprintf "%s: %s: %s\n" me file (Printexc.to_string e); flush stderr); incr errors; None) else (try Some (open_in file) with e -> if not (opt_test opt 'q') then (Printf.eprintf "%s: %s: %s\n" me file (Printexc.to_string e); flush stderr); incr errors; None) in (* Run matcher on the open channel, then close the channel. *) match maybe_channel with | None -> () | Some channel -> begin try if opt_test opt 'T' then (Printf.eprintf "%s: checking %s\n" me file; flush stderr); Stream.iter matchline (!stream_of_channel channel); close_in channel with | NextFile -> close_in channel | e -> close_in channel; raise e end; if opt_test opt 'c' then (Printf.printf "%s%d\n" (if !mult then file ^ ":" else "") !total; flush stdout) in (* Handle each of the specified files and directories. *) List.iter (fun file -> if file = "-" then matchfile file else if try Sys.is_directory file with _ -> false then matchdir file else matchfile file) files (* Parse command line arguments, run matcher, and set result status. *) let opt, matcher, files = parse_args () let () = matchfiles opt matcher files; if !errors > 0 then exit 2; if !grand_total > 0 then exit 0; exit 1 |
#directory "+pcre";; #load "pcre.cma";; (*-----------------------------*) Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`CASELESS] "^m*(d?c{0,3}|c[dm])(l?x{0,3}|x[lc])(v?i{0,3}|i[vx])$") input (*-----------------------------*) Pcre.replace ~pat:"(\\S+)(\\s+)(\\S+)" ~templ:"$3$2$1" input (*-----------------------------*) Pcre.extract ~full_match:false ~pat:"(\\w+)\\s*=\\s*(.*)\\s*$" input (*-----------------------------*) Pcre.pmatch ~pat:".{80,}" input (*-----------------------------*) Pcre.extract ~full_match:false ~pat:"(\\d+)/(\\d+)/(\\d+) (\\d+):(\\d+):(\\d+)" input (*-----------------------------*) Pcre.replace ~pat:"/usr/bin" ~templ:"/usr/local/bin" input (*-----------------------------*) Pcre.substitute_substrings ~pat:"%([0-9A-Fa-f][0-9A-Fa-f])" ~subst:(fun subs -> let c = Pcre.get_substring subs 1 in String.make 1 (Char.chr (int_of_string ("0x" ^ c)))) input (*-----------------------------*) Pcre.replace ~rex:(Pcre.regexp ~flags:[`DOTALL; `EXTENDED] " /\\* # Match the opening delimiter .*? # Match a minimal number of characters \\*/ # Match the closing delimiter ") input (*-----------------------------*) Pcre.replace ~pat:"^\\s+" input Pcre.replace ~pat:"\\s+$" input (*-----------------------------*) Pcre.replace ~pat:"\\\\n" ~templ:"\n" input (*-----------------------------*) Pcre.replace ~pat:"^.*::" input (*-----------------------------*) Pcre.extract ~full_match:false ~pat:"^([01]?\\d\\d|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d|2[0-4]\\d|25[0-5])\\.\ ([01]?\\d\\d|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d|2[0-4]\\d|25[0-5])$" input (*-----------------------------*) Pcre.replace ~pat:"^.*/" input (*-----------------------------*) let termcap = ":co#80:li#24:" let cols = try int_of_string (Pcre.extract ~pat:":co#(\\d+):" termcap).(1) with Not_found | Failure "int_of_string" -> 80 (*-----------------------------*) let name = Pcre.replace ~pat:" /\\S+/" ~templ:" " (" " ^ String.concat " " (Array.to_list Sys.argv)) (*-----------------------------*) #load "unix.cma";; let () = let ch = Unix.open_process_in "uname -a" in let os = input_line ch in ignore (Unix.close_process_in ch); if not (Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`CASELESS] "linux") os) then print_endline "This isn't Linux" (*-----------------------------*) Pcre.replace ~pat:"\n\\s+" ~templ:" " input (*-----------------------------*) let nums = Array.map (fun group -> float_of_string group.(1)) (Pcre.extract_all ~pat:"(\\d+\\.?\\d*|\\.\\d+)" input) (*-----------------------------*) let capwords = Array.map (fun group -> group.(1)) (Pcre.extract_all ~pat:"(\\b[^\\Wa-z0-9_]+\\b)" input) (*-----------------------------*) let lowords = Array.map (fun group -> group.(1)) (Pcre.extract_all ~pat:"(\\b[^\\WA-Z0-9_]+\\b)" input) (*-----------------------------*) let icwords = Array.map (fun group -> group.(1)) (Pcre.extract_all ~pat:"(\\b[^\\Wa-z0-9_][^\\WA-Z0-9_]*\\b)" input) (*-----------------------------*) let links = Array.map (fun group -> group.(1)) (Pcre.extract_all ~rex:(Pcre.regexp ~flags:[`DOTALL; `CASELESS] "<A[^>]+?HREF\\s*=\\s*[\"']?([^'\" >]+?)[ '\"]?>") input) (*-----------------------------*) let initial = try (Pcre.extract ~pat:"^\\S+\\s+(\\S)\\S*\\s+\\S" input).(1) with Not_found -> "" (*-----------------------------*) Pcre.replace ~pat:"\"([^\"]*)\"" ~templ:"``$1''" input (*-----------------------------*) let sentences = Array.map (fun group -> group.(1)) (Pcre.extract_all ~pat:"(\\S.*?\\pP)(?= |\\Z)" (Pcre.replace ~pat:" {3,}" ~templ:" " (Pcre.replace ~pat:"\n" ~templ:" " (Pcre.replace ~pat:"(\\pP\n)" ~templ:"$1 " input)))) (*-----------------------------*) Pcre.extract ~full_match:false ~pat:"(\\d{4})-(\\d\\d)-(\\d\\d)" input (*-----------------------------*) Pcre.pmatch ~pat:"^[01]?[- .]?(\\([2-9]\\d{2}\\)|[2-9]\\d{2})[- .]?\\d{3}[- .]?\\d{4}$" input (*-----------------------------*) Pcre.pmatch ~rex:(Pcre.regexp ~flags:[`CASELESS] "\\boh\\s+my\\s+gh?o(d(dess(es)?|s?)|odness|sh)\\b") input (*-----------------------------*) let lines = Array.map (fun group -> group.(1)) (Pcre.extract_all ~pat:"([^\010\013]*)(\010\013?|\013\010?)" input) |