Count different elements in list - Ocaml

Reading time ~4 minutes

It is quite common to have to count the occurences of each element in a list. Python would propose the Counter class, per example, and looking around, one would soon enough find the following snippet :

from collections import Counter

words = ['a', 'b', 'c', 'a']

Counter(words).keys() # equals to list(set(words))
Counter(words).values() # counts the elements' frequency

However, OCaml lacks the abundance of snippets to find here and there. This article will focus on counting the occurences of each element in a list, using various approaches.

A functional approach

A simple way relying on built-in List module would be to use sort_unique and count the element appearing in the list for each element returned by sort_unique.

let count_unique_elements_naive list = 
  let count_element e list = List.filter (fun x -> x = e) list |> List.length in
  List.sort_uniq Int.compare list 
  |> List.map (fun e -> (e, count_element e list))

The following could be put in a count.ml file in order to test the previous snippet:

let print_int_pair_list list =
  let string_of_int_pair pair = match pair with
  | a,b -> (string_of_int a)^" - "^(string_of_int b)^"\n" in
  List.map string_of_int_pair list 
  |> List.iter print_string

let count_unique_elements_naive list = 
  let count_element e list = List.filter (fun x -> x = e) list |> List.length in
  List.sort_uniq Int.compare list 
  |> List.map (fun e -> (e, count_element e list))


let () =
  count_unique_elements_naive [1; 1; 3; 5; 7; 7; 1]
  |> print_int_pair_list

Running ocaml count.ml would output the following:

1 - 3
3 - 1
5 - 1
7 - 2

Obviously, this method makes as many passes on the list as the number of unique elements in the list, which is a waste of time.

Using Hash tables

Here hash tables comes-in handy. Updating a counter for each element of the list, one can achieve the same results in a single pass over the data.

Note that in recent versions (>= 4.07), it is easier to turn Hashtbls to list or seq, using the to_seq functions.

let count_unique_elements_hashtbl list =
  let counter = Hashtbl.create 10000 in
  let update_counter x = 
    if Hashtbl.mem counter x then
      let current_count = Hashtbl.find counter x in
      Hashtbl.replace counter x (succ current_count)
    else
      Hashtbl.replace counter x 1
      in
  List.iter update_counter list;
  Hashtbl.to_seq counter
  |> List.of_seq

Using integer hash tables

As stated in the documentation:

The functorial interface allows the use of specific comparison and hash functions, either for performance/security concerns, or because keys are not hashable/comparable with the polymorphic builtins.

Performing a minor replacement in the code above, we can rewrite this as:

module IntHash =
  struct
    type t = int
        let equal i j = i=j
        let hash i = i land max_int
  end


(* Here we create a specialized module for hash tables whose key is an integer *)
module IntHashtbl = Hashtbl.Make(IntHash)


let count_unique_elements_int_hashtbl list =
  let counter = IntHashtbl.create 10000 in
  let update_counter x = 
    if IntHashtbl.mem counter x then
      let current_count = IntHashtbl.find counter x in
      IntHashtbl.replace counter x (succ current_count)
    else
      IntHashtbl.replace counter x 1
  in
  List.iter update_counter list;
  IntHashtbl.to_seq counter
  |> List.of_seq

Benchmark

And the winner is count_unique_elements_int_hashtbl.

$ ocamlopt main.ml 
$ ./a.out 
count_unique_elements_naive        Execution time: 0.372140s
count_unique_elements_hashtbl      Execution time: 0.089627s
count_unique_elements_int_hashtbl  Execution time: 0.081218s

If one wants to reproduce the results:

let print_int_pair_list list =
  let string_of_int_pair pair = match pair with
  | a,b -> (string_of_int a)^" - "^(string_of_int b)^"\n" in
  List.map string_of_int_pair list 
  |> List.iter print_string


let time f x =
  let t = Sys.time() in
  let fx = f x in
  Printf.printf "Execution time: %fs\n" (Sys.time() -. t);
  fx


let count_unique_elements_naive list = 
  let count_element e list = List.filter (fun x -> x = e) list |> List.length in
  List.sort_uniq Int.compare list 
  |> List.map (fun e -> (e, count_element e list))


let count_unique_elements_hashtbl list =
  let counter = Hashtbl.create 10000 in
  let update_counter x = 
    if Hashtbl.mem counter x then
      let current_count = Hashtbl.find counter x in
      Hashtbl.replace counter x (succ current_count)
    else
      Hashtbl.replace counter x 1
      in
  List.iter update_counter list;
  Hashtbl.to_seq counter
  |> List.of_seq


module IntHash =
  struct
    type t = int
        let equal i j = i=j
        let hash i = i land max_int
  end


(* Using the Hashtbl functorial interface *)
module IntHashtbl = Hashtbl.Make(IntHash)


let count_unique_elements_int_hashtbl list =
  let counter = IntHashtbl.create 10000 in
  let update_counter x = 
    if IntHashtbl.mem counter x then
      let current_count = IntHashtbl.find counter x in
      IntHashtbl.replace counter x (succ current_count)
    else
      IntHashtbl.replace counter x 1
  in
  List.iter update_counter list;
  IntHashtbl.to_seq counter
  |> List.of_seq


let sample = List.init 1000000 (fun _ -> Random.int 10)


let named_methods = [("count_unique_elements_naive", count_unique_elements_naive)
  ;("count_unique_elements_hashtbl", count_unique_elements_hashtbl)
  ;("count_unique_elements_int_hashtbl", count_unique_elements_int_hashtbl)]

let () =
  let _ = List.map (fun pair -> print_string (fst pair); print_string "\t";time (snd pair) sample) named_methods in
  ()

Books

Real World OCaml: Functional programming for the masses by Yaron Minsky, Anil Madhavapeddy and Jason Hickey is a good introduction to OCaml.

Purely Functional Data Structures by Chris Okasaki, which presents another way to look at data structures, from a functional point of view. As a prerequisite, a knowledge of common structures such as Heaps, Linked Lists is needed.

Python plot 3d scatter and density

It is often easy to compare, in dimension one, an histogram and the underlying density. This is quite useful when one want to visually ev...… Continue reading

Best casual readings in mathematics

Published on May 03, 2020

LightGBM on the GPU

Published on April 29, 2020