5. Hashes

Introduction

(*-----------------------------*)
(* build an hash table element by element *)
let age = Hashtbl.create 3 ;;  (* 3 is the supposed average size for the
                                  hash table *)
Hashtbl.replace age "Nat" 24 ;
Hashtbl.replace age "Jules" 25 ;
Hashtbl.replace age "Josh" 17 ;;
(*-----------------------------*)
let assoc_list2hashtbl assoc_list = 
  let h = Hashtbl.create 0 in
  List.iter (fun (k,v) -> Hashtbl.replace h k v) assoc_list ;
  h

let food_color = assoc_list2hashtbl 
    [ "Apple", "red" ; 
      "Banana", "yellow" ; 
      "Lemon", "yellow" ; 
      "Carrot", "orange" ;
    ] ;;
(*-----------------------------*)

Adding an Element to a Hash

(*-----------------------------*)
Hashtbl.replace tbl key value ;;
(*-----------------------------*)
(* food_color defined per the introduction *)
Hashtbl.replace food_color "Raspberry" "pink" ;;


let hashtbl_keys h = Hashtbl.fold (fun key _ l -> key :: l) h []
let hashtbl_values h = Hashtbl.fold (fun _ value l -> value :: l) h []
let hashtbl2assoc_list h = Hashtbl.fold (fun key value l -> (key, value) :: l) h []
;;
print_string "Known_foods:\n" ;
Hashtbl.iter (fun food _ -> print_endline food) food_color ;
print_string "Known_foods:\n" ;
List.iter print_endline (hashtbl_keys food_color) ;;
(*
> Known_foods:
> Banana
> Raspberry
> Apple
> Carrot
> Lemon
*)
(*-----------------------------*)

Testing for the Presence of a Key in a Hash

(*-----------------------------*)
(* does %HASH have a value for $KEY ?  *)
if (Hashtbl.mem hash key) then
  (* it exists *)
else
  (* id doesn't exists *)
 ;;
(*-----------------------------*)
(* food_color defined per the introduction *)
List.iter (fun name ->
  let kind = if Hashtbl.mem food_color name then "food" else "drink" in
  printf "%s is a %s.\n" name kind
) ["Banana"; "Martini"] ;;
(*
> Banana is a food.
> Martini is a drink.
*)
(*-----------------------------*)
(* there's no such thing called "undef", "nil" or "null" in Caml
   if you really want such a value, use type "option" as shown below *)
let age = assoc_list2hashtbl 
    [ "Toddler", 3 ; "Unborn", 0 ] ;;
(*> val age : (string, int) Hashtbl.t = <abstr> *)

List.iter (fun thing ->
  printf "%s: %s\n" thing
    (try match Hashtbl.find age thing with
    | 0 -> "Exists"
    | _ -> "Exists NonNull"
    with Not_found -> "")
) ["Toddler" ; "Unborn" ; "Phantasm" ; "Relic" ]

let age = assoc_list2hashtbl 
    [ "Toddler", Some 3 ; "Unborn", Some 0 ; "Phantasm", None ] ;;
(*> val age : (string, int option) Hashtbl.t = <abstr> *)

List.iter (fun thing ->
  printf "%s: %s\n" thing
    (try match Hashtbl.find age thing with
    | None -> "Exists"
    | Some 0 -> "Exists Defined"
    | Some _ -> "Exists Defined NonNull"
    with Not_found -> "")
) ["Toddler" ; "Unborn" ; "Phantasm" ; "Relic" ]
(*
> Toddler: Exists Defined NonNull
> Unborn: Exists Defined
> Phantasm: Exists
> Relic: 
*)
(*-----------------------------*)
let size = Hashtbl.create 20 in
List.iter (fun f -> 
  if not (Hashtbl.mem size f) then
    Hashtbl.replace size f (Unix.stat f).Unix.st_size;
) (readlines stdin);
(*-----------------------------*)
(* here is a more complete solution which does stat 2 times the same file (to
be mimic perl's version) *)
let size = Hashtbl.create 20 in
List.iter (fun f -> 
  if not (Hashtbl.mem size f) then
    Hashtbl.replace size f (try Some (Unix.stat f).Unix.st_size with _ -> None)
) (readlines stdin);

Deleting from a Hash

(*-----------------------------*)
(* remove $KEY and its value from %HASH *)
Hashtbl.remove hash key ;
(*-----------------------------*)
(* food_color as per Introduction *)
open Printf

let print_foods () =
  printf "Keys: %s\n" (String.concat " " (hashtbl_keys food_color)) ;
  printf "Values: %s\n" (String.concat " " (hashtbl_values food_color))
;;
print_string "Initially:\n";
print_foods ();

print_string "\nWith Banana deleted\n";
Hashtbl.remove food_color "Banana";
print_foods ()
;;
(*-----------------------------*)
Hashtbl.clear food_color ;;
(*-----------------------------*)

Traversing a Hash

(*-----------------------------*)
(* in this section consider opened the Printf module using: *)
open Printf;;

Hashtbl.iter
  (fun key value ->
    (*
      do something with key and value
    *)
  )
  hash
;;
(*-----------------------------*)
List.iter (fun key ->
  let value = Hashtbl.find hash key in
    (*
      do something with key and value
    *)
) (hashtbl_keys hash)
;;
(*-----------------------------*)
(* food_color as defined in the introduction *)
Hashtbl.iter (printf "%s is %s.\n") food_color;
(*
> Lemon is yellow.
> Apple is red.
> Carrot is orange.
> Banana is yellow.
*)
(* but beware of: *)
Hashtbl.iter (printf "food_color: %s is %s.\n") food_color;
(*
> food_color: Lemon is yellow.
> Apple is red.
> Carrot is orange.
> Banana is yellow.
*)
(* write this instead:
  (more on it at http://caml.inria.fr/ocaml/htmlman/manual055.html) *)
Hashtbl.iter (fun k v -> printf "food_color: %s is %s.\n" k v) food_color;
(*
> food_color: Lemon is yellow.
> food_color: Apple is red.
> food_color: Carrot is orange.
> food_color: Banana is yellow.
*)

List.iter (fun key ->
  let value = Hashtbl.find food_color key in
  printf "%s is %s.\n" key value
) (hashtbl_keys food_color) ;
(*
> Lemon is yellow.
> Apple is red.
> Carrot is orange.
> Banana is yellow.
*)

(*-----------------------------*)
List.iter
  (fun key ->
    printf "%s is %s.\n" key (Hashtbl.find food_color key)
  )
  (sort_ (hashtbl_keys food_color))
;;

(*
> Apple is red.
> Banana is yellow.
> Carrot is orange.
> Lemon is yellow.
*)

(*-----------------------------*)
(* Ocaml is safe in loop, so you can't reset the hash iterator as in
Perl and you don't risk infinite loops using, say, List.iter or
Hashtbl.iter, but if you really want to infinite loop on the first key
you get ... *)
List.iter
  (fun key ->
    while true do
      printf "Processing %s\n" key
    done
  )
  (hashtbl_keys food_color)
;;
(*-----------------------------*)
(* countfrom - count number of messages from each sender *)
let main () =
  let file =
    let files = ref [] in
    Arg.parse [] (fun file -> files := !files @ [file]) "";
    try
      open_in (List.hd !files)
    with Failure "hd" -> stdin
  in
  let from = Hashtbl.create 50 in
  let add_from address =
    let old_count =
      try Hashtbl.find from address
      with Not_found -> 0
    in
    let new_count = old_count + 1 in
    Hashtbl.replace from address new_count;
  in
  let extractfrom = Str.regexp "^From: \(.*\)" in

  iter_lines (fun line ->
    if (Str.string_match extractfrom line 0) then
      add_from (Str.matched_group 1 line)
    else ()
  ) file;
  Hashtbl.iter (printf "%s: %d\n") from
;;
main() ;

Printing a Hash

(*-----------------------------*)
(* note that OCaml does not have a native polymorphic print function, so
examples in this section work for hashes that map string keys to string
values *)
Hashtbl.iter (printf "%s => %s\n") hash ;
(*-----------------------------*)

(* map in ocaml maps a function on a list, rather that evaluate an
expression in turn on a list as Perl does *)
List.iter
  (fun key ->
    printf "%s => %s\n" key (Hashtbl.find hash key)
  )
  (hashtbl_keys hash) ;
(*-----------------------------*)

(* build a list from an hash table, note that this is possibile only if
the type of key and value are the same *)
let hashtbl2list hash =
  Hashtbl.fold
    (fun key value init -> key :: value :: init)
    hash
    []
;;
List.iter (printf "%s ") (hashtbl2list hash) ;
(* or *)
print_endline (String.concat " " (hashtbl2list hash)) ;

Retrieving from a Hash in Insertion Order

(*-----------------------------*)
(* In OCaml one usually use association lists which really is a list of
(key,value). Note that insertion and lookup is O(n) (!!!) *)

(* initialization *)
let empty_food_color = []
let food_color = 
    [ "Banana", "Yellow" ; 
      "Apple", "Green" ; 
      "Lemon", "Yellow" ; 
    ]
(* adding *)
let food_color' = food_color @ [ "Carrot", "orange" ]
;;
(* output entries in insertion order *)
print_endline "In insertion order, the foods are:";
List.iter (printf "%s is colored %s.\n") food_color;
(*
> Banana is colored Yellow.
> Apple is colored Green.
> Lemon is colored Yellow.
*)
(* is it a key? *)
let has_food food = mem_assoc food food_color
(* remove a key *)
let remove_food food = remove_assoc food food_color
(* searching *)
let what_color food =
  try
    let color = assoc food food_color in
    printf "%s is colored %s.\n" food color
  with Not_found -> printf "i don't know the color of %s\n" food
;;

Hashes with Multiple Values Per Key

(*-----------------------------*)
let re = Str.regexp "^\([^ ]*\) *\([^ ]*\)" in
let lines = readlines (Unix.open_process_in "who") in
let ttys = filter_some (List.map (fun line ->
  if (Str.string_match re line 0) then
    Some(Str.matched_group 1 line, Str.matched_group 2 line)
  else None) lines) in
List.iter
  (fun user ->
    printf "%s: %s\n" user (String.concat " " (all_assoc user ttys))
  ) (sort_ (uniq (List.map fst ttys)))
;
(*-----------------------------*)
List.iter
  (fun user ->
    let ttylist = all_assoc user ttys in
    printf "%s: %d ttys.\n" user (List.length ttylist);
    List.iter
      (fun tty ->
        let uname =
          try
            let uid = (Unix.stat ("/dev/" ^ tty)).Unix.st_uid in
            (Unix.getpwuid uid).Unix.pw_name
          with Unix.Unix_error _ -> "(not available)"
        in
        printf "%s (owned by %s)\n" tty uname
      ) ttylist
  ) (sort_ (uniq (List.map fst ttys)))
(*-----------------------------*)

Inverting a Hash

(*-----------------------------*)

open Hashtbl

(* size of an hash, i.e. number of bindings *)
let hashtbl_size h = List.length (hashtbl_keys h);;

(* in OCaml does not exists a builtin function like "reverse", here is
an equivalent one: *)
let hashtbl_reverse h =
  assoc_list2hashtbl (List.combine (hashtbl_values h) (hashtbl_keys h))
(* or *)
let hashtbl_reverse h =
  assoc_list2hashtbl (List.map (fun (a,b) -> (b,a)) (hashtbl2assoc_list h))
;;
(* or *)
let hashtbl_reverse_multi h =
  let newhash = Hashtbl.create (hashtbl_size h) in
  List.iter
    (fun v -> add newhash (find h v) v)
    (hashtbl_keys h);
  newhash
(* note that the last  implementation maintain also multiple binding for the
same key, see Hashtbl.add in the standard OCaml library for more info *)

(*-----------------------------*)
(* example of hashtbl_reverse *)

let reverse = hashtbl_reverse lookup;;
(*-----------------------------*)
let surname = assoc_list2hashtbl ["Mickey", "Mantle"; "Babe", "Ruth"] in
let firstname = hashtbl_reverse surname in
print_endline (Hashtbl.find firstname "Mantle");;
(*
> Mickey
*)

(*-----------------------------*)
(* foodfind - find match for food or color *)

let given = Sys.argv.(1) in
let color = assoc_list2hashtbl
  ["Apple", "red";
   "Banana", "yellow";
   "Lemon", "yellow";
   "Carrot", "orange"] in
let food = hashtbl_reverse color in
(try
  printf "%s is a food with color %s.\n" given (Hashtbl.find color given);
with Not_found -> ());
(try
  printf "%s is a food with color %s.\n" (Hashtbl.find food given) given
with Not_found -> ())
;;
(*-----------------------------*)
(* food_color defined as previous *)

let foods_with_color = hashtbl_reverse food_color in
List.iter (printf "%s ") (Hashtbl.find_all foods_with_color "yellow");
print_endline "were yellow foods."
;;
(*-----------------------------*)

Sorting a Hash

(*-----------------------------*)

(* you may define your own compare function to be used in sorting *)
let keys = List.sort compare_function (hashtbl_keys hash) in
List.iter
  (fun key ->
    let value = Hashtbl.find hash key in
    (* do something with key and value *)
    ()
  )
  keys ;
(* or use this one if you want to compare not only on keys *)
Hashtbl.iter
  (fun (key, value) ->
    (* do something with key and value *)
    ()
  ) (List.sort compare_function (hashtbl2assoc_list hash)) ;
(*-----------------------------*)
List.iter
  (fun food ->
    printf "%s is %s.\n" food (Hashtbl.find food_color food)
  )
  (List.sort (hashtbl_keys food_color))
;;
(*-----------------------------*)
(* examples of "compare_function": *)

(* alphabetical sort on the hash value *)
let compare_function (_,color1) (_,color2) = compare color1 color2

(* length sort on the hash value *)
let compare_function (_,color1) (_,color2) = compare (String.length color1) (String.length color2)

(*-----------------------------*)

Merging Hashes

(*-----------------------------*)
(* definition of merge function on hashes: *)
let hashtbl_merge h1 h2 = assoc_list2hashtbl (hashtbl2assoc_list h1 @ hashtbl2assoc_list h2)

(* usage: *)
let merged = hashtbl_merge a b;;
(*-----------------------------*)
let merged = Hashtbl.create 0 in
List.iter
  (Hashtbl.iter (fun k v -> Hashtbl.add merged k v))
  [a;b]
;;
(*-----------------------------*)
let drink_color = assoc_list2hashtbl
    ["Galliano", "yellow";
     "Mai Tai", "blue"]
;;

let ingested_color = hashtbl_merge drink_color food_color;;
(*-----------------------------*)
let substance_color = Hashtbl.create 0 in
List.iter
  (Hashtbl.iter (fun k v -> Hashtbl.add merged k v))
  [food_color; drink_color]
;;

Finding Common or Different Keys in Two Hashes

(*-----------------------------*)
let common =
  List.filter
    (fun key -> Hashtbl.mem hash2 key)
    (hashtbl_keys hash1)
;;
(* common now contains commne keys, note that a key may appear multiple
times in this list due tu multiple bindings allowed in Hashtbl
implementation *)

let this_not_that =
  List.filter
    (fun key -> not (Hashtbl.mem hash2 key))
    (hashtbl_keys hash1)
;;
(*-----------------------------*)
let citrus_color = assoc_list2hashtbl
                      ["Lemon", "yellow";
                       "Orange", "orange";
                       "Lime", "green"]
in
let non_citrus = Hashtbl.create 3 in
List.filter
  (fun key -> not (Hashtbl.mem citrus_color key))
  (hashtbl_keys food_color)
;;

(*-----------------------------*)

Hashing References

(*-----------------------------*)
open Unix;;
open Printf;;

let filenames = ["/etc/printcap"; "/vmlinuz"; "/bin/cat"] in
let openfiles = Hashtbl.create 3 in
print_newline();
List.iter
  (fun fname ->
    printf "%s is %d bytes long.\n"
      fname
      (stat fname).st_size
  )
  filenames
;;
  
(*-----------------------------*)

Presizing a Hash

(*-----------------------------*)

(* presize hash to num elements *)
let hash = Hashtbl.create num;;
(* other examples of initial size on hashes *)
let hash = Hashtbl.create 512;;
let hash = Hashtbl.create 1000;;

(*-----------------------------*)

Finding the Most Common Anything

(*-----------------------------*)

(* size of an array named "a" *)
let count = Array.length a;;

(* size of a list named "l" *)
let count = List.length l;;

(*-----------------------------*)

Representing Relationships Between Data

(*-----------------------------*)

open Printf;;
open Hashtbl;;

let father = assoc_list2hashtbl
  [ "Cain", "Adam";
    "Abel", "Adam";
    "Seth", "Adam";
    "Enoch", "Cain";
    "Irad", "Enoch";
    "Mehujael", "Irad";
    "Methusael", "Mehujael";
    "Lamech", "Methusael";
    "Jabal", "Lamech";
    "Jubal", "Lamech";
    "Tubalcain", "Lamech";
    "Enos", "Seth"] ;;
(*-----------------------------*)
(* recursively print all parents of a given name *)
let rec parents s =
  printf "%s " s;
  if mem father s then
    parents (find father s)
  else
    printf "\n"
in
  iter_lines parents stdin
;;
(*-----------------------------*)
let children = hashtbl_reverse_multi father in
iter_lines 
  (fun line ->
    List.iter (printf "%s ") (find_all children line);
    print_newline()
  )
  stdin;
;;
(*-----------------------------*)
(* build an hash that map filename to list of included file *)
open Hashtbl;;
open Str;;

let includes = create (List.length files);;
let includeRE = regexp "^#include <\([a-zA-Z0-9.]+\)>";;
let isincludeline l = string_match includeRE l 0;;
let getincludes fname =
  let includelines =
    List.filter isincludeline (readlines (open_in fname))
  in
  List.map (replace_first includeRE "\1") includelines
;;
List.iter (fun fname -> add includes fname (getincludes fname)) files;;

(*-----------------------------*)
(* build a list of files that does not include system headers *)
let hasnoinclude fname = (find includes fname = []) in
List.filter hasnoinclude (uniq (hashtbl_keys includes));;

(*-----------------------------*)

Program: dutree

(*-----------------------------*)

#!/usr/bin/ocaml
(* dutree - print sorted indented rendition of du output *)
#load "str.cma";;
#load "unix.cma";;

let dirsize = Hashtbl.create 0
let kids = Hashtbl.create 0

(* run du, read in input, save sizes and kids *)
(* return last directory (file?) read *)
let input () =
  let last_name = ref "" in
  let last_push = ref None in
  let argv = "du" :: List.tl (Array.to_list Sys.argv) in
  let ch = Unix.open_process_in (String.concat " " argv) in
  begin
    try
      while true do
        let line = input_line ch in
        match Str.bounded_split (Str.regexp "[ \t]+") line 2 with
          | [size; name] ->
              let size = int_of_string size in
              Hashtbl.replace dirsize name size;
              let parent =
                Str.replace_first (Str.regexp "/[^/]+$") "" name in
              last_name := name;
              last_push :=
                Some (parent,
                      try Some (Hashtbl.find kids parent)
                      with Not_found -> None);
              Hashtbl.replace kids parent
                (name ::
                   (try Hashtbl.find kids parent
                    with Not_found -> []))
          | _ -> failwith line
      done
    with End_of_file ->
      ignore (Unix.close_process_in ch)
  end;
  begin
    match !last_push with
      | None -> ()
      | Some (parent, None) ->
          Hashtbl.remove kids parent
      | Some (parent, Some previous) ->
          Hashtbl.replace kids parent previous
  end;
  !last_name

(* figure out how much is taken up in each directory *)
(* that isn't stored in subdirectories.  add a new *)
(* fake kid called "." containing that much. *)
let rec getdots root =
  let size = Hashtbl.find dirsize root in
  let cursize = ref size in
  if Hashtbl.mem kids root
  then
    begin
      List.iter
        (fun kid ->
           cursize := !cursize - Hashtbl.find dirsize kid;
           getdots kid)
        (Hashtbl.find kids root)
    end;
  if size <> !cursize
  then
    begin
      let dot = root ^ "/." in
      Hashtbl.replace dirsize dot !cursize;
      Hashtbl.replace kids root
        (dot ::
           (try Hashtbl.find kids root
            with Not_found -> []))
    end

(* recursively output everything, *)
(* passing padding and number width in as well *)
(* on recursive calls *)
let rec output ?(prefix="") ?(width=0) root =
  let path = Str.replace_first (Str.regexp ".*/") "" root in
  let size = Hashtbl.find dirsize root in
  let line = Printf.sprintf "%*d %s" width size path in
  Printf.printf "%s%s\n" prefix line;
  let prefix =
    Str.global_replace (Str.regexp "[^|]") " "
      (Str.replace_first (Str.regexp "[0-9] ") "| "
         (prefix ^ line)) in
  if Hashtbl.mem kids root
  then
    begin
      let kids = Hashtbl.find kids root in
      let kids =
        List.rev_map
          (fun kid -> (Hashtbl.find dirsize kid, kid)) kids in
      let kids = List.sort compare kids in
      let kids = List.rev_map (fun (_, kid) -> kid) kids in
      let width =
        String.length
          (string_of_int (Hashtbl.find dirsize (List.hd kids))) in
      List.iter (output ~prefix ~width) kids
    end

let () =
  let topdir = input () in
  getdots topdir;
  output topdir