Sunday, January 20, 2008

Huffman Coding in F#

#light // I.e. lightweight syntax

namespace Nassar
// An F# module is a "static class." All the declarations
// that follow are static members of this class.

module Huffman

type Incidence = HashMultiMap<char, int>
type CharSet = Set<char>

let addToCount (d : Incidence) (c : char) =
// For some reason, the "do" is necessary here.
do match d.TryFind c with
Some(n) -> d.Replace(c, n + 1)
_ -> d.Add(c, 1)
d // Return the dictionary.

let count (s : string) =
// I only just noticed that System.String implements
// IEnumerable. The F# compiler correctly inferred from
// the type of Incidence that s has to be a sequence of
// char.

s |> Seq.fold addToCount (Incidence.Create())

type HuffmanNode =
Leaf of HuffmanLeaf
Tree of HuffmanTree
and HuffmanLeaf = {Count: int;
Char: char;}
and HuffmanTree = {Count: int;
Chars: CharSet;
Left: HuffmanNode;
Right: HuffmanNode;}

let get_count node =
match node with
Leaf l -> l.Count
Tree t -> t.Count

let has_char node c =
match node with
Leaf l -> l.Char = c
Tree t -> t.Chars.Contains c

let get_set node =
match node with
Leaf leaf -> CharSet.Singleton leaf.Char
Tree tree -> tree.Chars

let merge_nodes left right =
Tree { Count = get_count left + get_count right;
Chars = (get_set left) + (get_set right);
Left = left;
Right = right }

let node_priority l r =
get_count l - get_count r

let rec reduce_list l =
// What I should really do is put all the new leaf nodes
// into a priority queue, and pull off two at a time as
// long as the length of the queue is > 1. Then
// make_tree wouldn't have to convert the sequence
// it creates from the dictionary into a list.

match l with
l::r::tail -> (merge_nodes l r)::tail
|> List.sort node_priority
// ...and recurse.
|> reduce_list
[tree] -> tree
[] -> failwith "Can't operate on empty list!"

let rec encode h s =
// Encode a character by branching to the leaf node
// with that character.

let rec encode' node c =
match node with
Leaf l -> []
// TODO Add an accumulator to make this tail-recursive,
// rather than prepending the new element.

Tree t -> if has_char t.Left c then 0::(encode' t.Left c) else 1::(encode' t.Right c)
s > (fun c -> encode' h c) > Seq.concat

// Decode the int list l using the tree h.
let rec decode h l =
let rec decode' node l =
match node, l with
// If we've arrived at a leaf, this must be the character
// we're looking for. Add it to the sequence, and resume
// from the top of the tree.
// TODO Add an accumulator to make this tail-recursive.

Leaf leaf, _ -> leaf.Char :: (decode h l)
// A 0 in the stream means a left branch during
// the encoding, and v.v.

Tree tree, n::tail -> if n = 0 then decode' tree.Left tail else decode' tree.Right tail
// Something was wrong in the encoding.
_, _ -> failwith "Something has gone very wrong."
match l with
[] -> []
// Start navigating down the tree.
_ -> decode' h l

let make_tree s =
let d = count s
// IDictionary<_, _> implements
// ICollection<KeyValuePair<_,_>>, so enumerate the
// pairs and convert each into a leaf node.

let leaves = d |> (fun pair ->
Leaf { Count = pair.Value;
Char = pair.Key }; )
|> Seq.to_list
|> List.sort node_priority
reduce_list leaves

No comments: