Count different elements in list - Ocaml

Reading time ~4 minutes

It is quite common to have to count the occurences of each element in a list. Python would propose the Counter class, per example, and looking around, one would soon enough find the following snippet :

from collections import Counter

words = ['a', 'b', 'c', 'a']

Counter(words).keys() # equals to list(set(words))
Counter(words).values() # counts the elements' frequency

However, OCaml lacks the abundance of snippets to find here and there. This article will focus on counting the occurences of each element in a list, using various approaches.

A functional approach

A simple way relying on built-in List module would be to use sort_unique and count the element appearing in the list for each element returned by sort_unique.

let count_unique_elements_naive list = 
  let count_element e list = List.filter (fun x -> x = e) list |> List.length in
  List.sort_uniq Int.compare list 
  |> List.map (fun e -> (e, count_element e list))

The following could be put in a count.ml file in order to test the previous snippet:

let print_int_pair_list list =
  let string_of_int_pair pair = match pair with
  | a,b -> (string_of_int a)^" - "^(string_of_int b)^"\n" in
  List.map string_of_int_pair list 
  |> List.iter print_string

let count_unique_elements_naive list = 
  let count_element e list = List.filter (fun x -> x = e) list |> List.length in
  List.sort_uniq Int.compare list 
  |> List.map (fun e -> (e, count_element e list))


let () =
  count_unique_elements_naive [1; 1; 3; 5; 7; 7; 1]
  |> print_int_pair_list

Running ocaml count.ml would output the following:

1 - 3
3 - 1
5 - 1
7 - 2

Obviously, this method makes as many passes on the list as the number of unique elements in the list, which is a waste of time.

Using Hash tables

Here hash tables comes-in handy. Updating a counter for each element of the list, one can achieve the same results in a single pass over the data.

Note that in recent versions (>= 4.07), it is easier to turn Hashtbls to list or seq, using the to_seq functions.

let count_unique_elements_hashtbl list =
  let counter = Hashtbl.create 10000 in
  let update_counter x = 
    if Hashtbl.mem counter x then
      let current_count = Hashtbl.find counter x in
      Hashtbl.replace counter x (succ current_count)
    else
      Hashtbl.replace counter x 1
      in
  List.iter update_counter list;
  Hashtbl.to_seq counter
  |> List.of_seq

Using integer hash tables

As stated in the documentation:

The functorial interface allows the use of specific comparison and hash functions, either for performance/security concerns, or because keys are not hashable/comparable with the polymorphic builtins.

Performing a minor replacement in the code above, we can rewrite this as:

module IntHash =
  struct
    type t = int
        let equal i j = i=j
        let hash i = i land max_int
  end


(* Here we create a specialized module for hash tables whose key is an integer *)
module IntHashtbl = Hashtbl.Make(IntHash)


let count_unique_elements_int_hashtbl list =
  let counter = IntHashtbl.create 10000 in
  let update_counter x = 
    if IntHashtbl.mem counter x then
      let current_count = IntHashtbl.find counter x in
      IntHashtbl.replace counter x (succ current_count)
    else
      IntHashtbl.replace counter x 1
  in
  List.iter update_counter list;
  IntHashtbl.to_seq counter
  |> List.of_seq

Benchmark

And the winner is count_unique_elements_int_hashtbl.

$ ocamlopt main.ml 
$ ./a.out 
count_unique_elements_naive        Execution time: 0.372140s
count_unique_elements_hashtbl      Execution time: 0.089627s
count_unique_elements_int_hashtbl  Execution time: 0.081218s

If one wants to reproduce the results:

let print_int_pair_list list =
  let string_of_int_pair pair = match pair with
  | a,b -> (string_of_int a)^" - "^(string_of_int b)^"\n" in
  List.map string_of_int_pair list 
  |> List.iter print_string


let time f x =
  let t = Sys.time() in
  let fx = f x in
  Printf.printf "Execution time: %fs\n" (Sys.time() -. t);
  fx


let count_unique_elements_naive list = 
  let count_element e list = List.filter (fun x -> x = e) list |> List.length in
  List.sort_uniq Int.compare list 
  |> List.map (fun e -> (e, count_element e list))


let count_unique_elements_hashtbl list =
  let counter = Hashtbl.create 10000 in
  let update_counter x = 
    if Hashtbl.mem counter x then
      let current_count = Hashtbl.find counter x in
      Hashtbl.replace counter x (succ current_count)
    else
      Hashtbl.replace counter x 1
      in
  List.iter update_counter list;
  Hashtbl.to_seq counter
  |> List.of_seq


module IntHash =
  struct
    type t = int
        let equal i j = i=j
        let hash i = i land max_int
  end


(* Using the Hashtbl functorial interface *)
module IntHashtbl = Hashtbl.Make(IntHash)


let count_unique_elements_int_hashtbl list =
  let counter = IntHashtbl.create 10000 in
  let update_counter x = 
    if IntHashtbl.mem counter x then
      let current_count = IntHashtbl.find counter x in
      IntHashtbl.replace counter x (succ current_count)
    else
      IntHashtbl.replace counter x 1
  in
  List.iter update_counter list;
  IntHashtbl.to_seq counter
  |> List.of_seq


let sample = List.init 1000000 (fun _ -> Random.int 10)


let named_methods = [("count_unique_elements_naive", count_unique_elements_naive)
  ;("count_unique_elements_hashtbl", count_unique_elements_hashtbl)
  ;("count_unique_elements_int_hashtbl", count_unique_elements_int_hashtbl)]

let () =
  let _ = List.map (fun pair -> print_string (fst pair); print_string "\t";time (snd pair) sample) named_methods in
  ()

Books

Real World OCaml: Functional programming for the masses by Yaron Minsky, Anil Madhavapeddy and Jason Hickey is a good introduction to OCaml.

Purely Functional Data Structures by Chris Okasaki, which presents another way to look at data structures, from a functional point of view. As a prerequisite, a knowledge of common structures such as Heaps, Linked Lists is needed.

How to optimize PyTorch code ?

Optimizing some deep learning code may seem quite complicated. After all, [PyTorch](https://pytorch.org/) is already super optimized so w...… Continue reading

Acronyms of deep learning

Published on March 10, 2024

AI with OCaml : the tic tac toe game

Published on September 24, 2023