Adaptive huffman coding has the advantage over static coding that the entire dataset does not need to be known in advance and the dictionary does not need to be transmitted separately from the data. It is also trickier to implement.
It would be wrong to call this implementation adaptive huffman coding because well, it isn't. It is an adaptive encoding, but my tree balancing routines are buggy and I've already spent enough time on this. To make it a true huffman encoding, just fix balance-FGK or balance-V.
The performance is hideously slow — around 200 characters per second on my machine. I think most of this is due to all the tree manipulation.
I intended to mate this code with the code in LZW, but the combination works pretty badly, probably because of my lack of a good idea for encoding the higher-ordered symbols effeciently.
# this uses the tree data structure from tcllib package require struct namespace eval ::huffman { # reverse a string # This is used to store partial words in big-endian order. 'binary # format' will only store bits starting at the msb or lsb of a word, so we # do the formatting little-endian and reverse it proc reverse {str} { set rv {} foreach c [split $str ""] { set rv [linsert $rv 0 $c] } return [join $rv ""] } # some tree helper functions # create a tree and set up symbol->node and node->symbol arrays proc mktree {} { variable trees variable symbols set tree [::struct::tree] set trees(e0) root set symbols(root) e0 $tree set root 0 return $tree } # delete the tree and arrays proc deltree {tree} { variable trees variable symbols $tree destroy unset trees unset symbols } # given a node, find the bit path to that node from the root proc getpath {tree node} { set p {} while {$node != "root"} { append p [$tree index $node] set node [$tree parent $node] } return [reverse $p] } # 'tree swap' doesn't move child trees, so we move the two nodes # separately using this helper proc swap {tree node1 node2} { set np [$tree parent $node1] set ni [$tree index $node1] $tree move [$tree parent $node2] [$tree index $node2] $node1 $tree move $np $ni $node2 } # dump out the tree # for debugging proc dump-tree {tree} { variable symbols $tree walk root -type bfs -order pre -command { if {[info exists symbols(%n)]} {set sym $symbols(%n)} else {set sym "-"} puts "[string repeat " " [%t depth %n]]%n($sym [%t get %n]): [%t children %n]" } } # increment weight of node, rebalance tree proc update-tree {tree node} { balance-FGK $tree $node } # update the node weights and rebalance the tree using algorithm FGK. # not quite right, doesn't always find the correct sibling. proc balance-FGK {tree node} { set cn $node while {$cn != "root"} { set next [$tree parent $cn] $tree set $cn [expr {[$tree get $cn]+1}] set sib [$tree next $cn] if {$sib == ""} { # if we're at the end of a row then the next sibling is the # start of the previous row. # this works if the tree is only 2 deep, otherwise it's really # wrong set sib [$tree previous $next] } if {$sib != "" && [$tree get $cn] > [$tree get $sib]} { swap $tree $cn $sib } set cn $next } # update root node $tree set $cn [expr {[$tree get $cn]+1}] } # update the node weights and rebalance the tree using algorithm V. # pretty much completely wrong proc balance-V {tree node} { # update the weights on the path to the added node set cn $node while {$cn != ""} { $tree set $cn [expr {[$tree get $cn]+1}] set cn [$tree parent $cn] } set pn {} # this is an attempt to walk the tree in 'implicit' order. # it doesn't work. $tree walk root -order post -type bfs -command { # if weight of current node is less than weight of previous node, # then swap if {$pn != "" && [%t get %n] < [%t get $pn]} { swap %t %n $pn } set pn %n } } # encode symbol (char/integer) as bitstring # handle int range 0-2^11 # the range is beyond 0-255 because this is intended to handle a LZW # dictionary rather than just characters. # there's got to be a better way. # symbol is encoded as a length bit (0 = 8 bits, 1 = 11 bits) followed by # the data in big-endian order proc encsym {sym} { set ebs "" ;# encoded bit-string if {$sym < 256} { append ebs 0 set si [binary format c $sym] binary scan $si B8 sv append ebs $sv } else { append ebs 1 set si [binary format s $sym] # format in little-endian then reverse binary scan $si b11 sv append ebs [reverse $sv] } return $ebs } # decode the next symbol in the bitstring starting at index 'ind' using # the given tree # updates the tree # returns the symbol and the new index proc getsym {tree bits ind} { variable symbols set n root set bl [llength $bits] while {![$tree isleaf $n] && $ind < $bl} { set n [lindex [$tree children $n] [lindex $bits $ind]] incr ind } if {$symbols($n) == "e0"} { # new symbol set lb [lindex $bits $ind] ;# length bit incr ind if {$lb == "0"} { set sb [join [lrange $bits $ind [expr {$ind+7}]] ""] incr ind 8 binary scan [binary format B8 $sb] c sym # mask to keep result unsigned set sym [expr {$sym & 0xff}] } else { set sb [join [lrange $bits $ind [expr {$ind+10}]] ""] incr ind 11 # reverse the bitstring and decode as little-endian binary scan [binary format b11 [reverse $sb]] s sym # mask to keep result unsigned set sym [expr {$sym & 0x7ff}] } set symbols($n) $sym } else { set sym $symbols($n) } # add the symbol to the tree and/or update the tree addsym $tree $sym return [list $sym $ind] } # insert the symbol into the tree if it doesn't exist # update the tree and the mapping arrays # returns code for symbol proc addsym {tree sym} { variable trees variable symbols set rs {} if {![info exists trees($sym)]} { # if symbol doesn't exist: split e0 into new node and new e0 append rs [getpath $tree $trees(e0)] [encsym $sym] set n $trees(e0) set trees($sym) [$tree insert $n 0] $tree set $trees($sym) 0 set trees(e0) [$tree insert $n 1] $tree set $trees(e0) 0 set symbols($trees($sym)) $sym set symbols($trees(e0)) e0 } else { # symbol exists, just emit code and update append rs [getpath $tree $trees($sym)] } update-tree $tree $trees($sym) return $rs } # input is a list of integers proc encode {rval} { set tree [mktree] set l [llength $rval] set c 0 foreach rv $rval { append bs [addsym $tree $rv] incr c # progress meter puts -nonewline stderr [format "%3.2f%%\r" [expr {double($c)/$l*100}]] } deltree $tree return $bs } # decode bitstring into list of integers proc decode {bs} { set tree [mktree] set bits [split $bs ""] set ind 0 set ds {} set bl [llength $bits] while {$ind < $bl} { set si [getsym $tree $bits $ind] lappend ds [lindex $si 0] set ind [lindex $si 1] # progress meter puts -nonewline stderr [format "%3.2f%%\r" [expr {double($ind)/$bl*100}]] } return $ds } } # end namespace eval ::huffman # convert string to list proc s2l {str} { set l {} foreach c [split $str ""] { binary scan $c c num lappend l $num } return $l } # convert list to string proc l2s {list} { set s "" foreach num $list { append s [binary format c $num] } return $s } # some testing set st "Hello, world!" # set st [read stdin] puts "input length: [string length $st] bytes" set cst [huffman::encode [s2l $st]] puts "compressed length: [expr {[string length $cst]/8}] (bytes) [format "%3.2f%%" [expr {(1-([string length $cst]/8)/double([string length $st]))*100}]]" set dst [l2s [huffman::decode $cst]] if {[string compare $dst $st] == 0} { puts "input and output match" } else { puts "input and output differ" }