A Hashing Trie in Tcl

George Peter Staplin Nov 28, 2008 - A hashing trie is a type of trie. They are much more scalable than hash tables, though they use a lot more memory. When I originally created my first trie (in C), I called it a pig structure, as I was unaware of the trie data structure.

There are now 2 versions below. I have added a trie-unset and support for empty values.

Caveat: the trie-get could be done with a single lindex rather than the foreach loop. Removing unused paths in the trie might be beneficial at some point, beyond just trie-unset.


 #By George Peter Staplin
 #
 #Revision 2
 #This is a simple hashing trie in Tcl.
 #A trie uses the actual values of a key for the path to the data.
 #This uses 4 bits per path for 16 directions per node.
 #
 #A would generate a path of: 1 4
 #AB would generate a path of: 1 4 2 4
 
 package require Tcl 8.5 ;#uses {*}
 
 
 proc trie-path str {
     set r [list]
 
     foreach c [split $str ""] {
         set n [scan $c %c]
 
         while {$n > 0} {
             #We use 4 bits for each path, so we have 16 nodes.
             set p [expr {$n & 15}]
 
             #Append this to the path.
             lappend r $p
 
             #Shift 4 bits to the right now that we got 2^4 - 1 with our & earlier.
             set n [expr {$n >> 4}]
         }
     }
 
     return $r
 }
 
 proc trie-create {} {
     #We create 17 list items.  The 17th is for the value of the association.
 
     return [lrepeat 17 [list]]
 }
 
 proc trie-set {tvar key value} {
     upvar 1 $tvar t
 
     set pathlist [trie-path $key]
 
 
     #First build up a path of valid sublists.
     set path [list]
 
     foreach p $pathlist {
         lappend path $p
 
         set subt [lindex $t {*}$path]
         if {![llength $subt]} {
             lset t {*}$path [trie-create]
         }
     }
 
     #We use a 2 element list to differentiate from an empty string value.
     lset t {*}$pathlist 16 [list DATA $value]
 
     #Enable this to see the actual serialized structure of the trie.
     #puts $t
 }
 
 proc trie-get {t key valuevar} {
     upvar 1 $valuevar value
 
     set pathlist [trie-path $key]
 
     set path [list]
     foreach p $pathlist {
         lappend path $p
 
         if {![llength [lindex $t {*}$path]]} {
             #This key's path is not in the trie.
             return 0
         }
     }
 
     set valuelist [lindex $t {*}$pathlist 16]
 
     if {2 != [llength $valuelist]} {
         #The value is not a valid [list DATA $value].
         return 0
     }
 
 
     set value [lindex $valuelist 1]
     #We found the value associated with the key.
     return 1
 }
 
 proc trie-unset {tvar key} {
     upvar 1 $tvar t
 
     set pathlist [trie-path $key]
 
     #Here we rely on lindex returning an empty list for an invalid offset.
     set valuelist [lindex $t {*}$pathlist 16]
         
     if {2 != [llength $valuelist]} {
         #The value is not set, or the key is invalid.
         return
     }
 
     #We have no need for a NULL value, just use an empty list.
     lset t {*}$pathlist [list]
 }
 
 
 proc main {} {
     set t [trie-create]
     
     set key1 "Peachy!"
     set key2 "Harder!  Better!  Faster!  Stronger!"
     set key3 "words of wisdom"
     
     trie-set t $key1 123 
 
     if {[trie-get $t $key1 value]} {
         puts "$value is associated with $key1"
     }
 
     if {[trie-get $t "invalid" value]} {
         puts stderr "err invalid key generated: $value"
     }
 
     trie-set t $key2 456
 
     if {[trie-get $t $key2 value]} {
         puts "$value is associated with $key2"
     }
 
     trie-unset t $key2
 
     if {[trie-get $t $key2 value]} {
         puts stderr "err got $value for an unset key!"
     }
     
     
     puts $t
 
     trie-set t $key3 [list "Don't bite off more than you can chew." \
                           "The cost of wisdom is pain."]
 
     puts $t
 
     if {[trie-get $t $key3 value]} {
         puts "'$key3' is associated with the value: $value"
     }
 }
 main

 #By George Peter Staplin
 #
 #This is a simple hashing trie in Tcl.
 #A trie uses the actual values of a key for the path to the data.
 #This uses 4 bits per path for 16 directions per node.
 #
 #A would generate a path of: 1 4
 #AB would generate a path of: 1 4 2 4
 
 package require Tcl 8.5 ;#uses {*}
 
 
 proc trie-path str {
     set r [list]
 
     foreach c [split $str ""] {
         set n [scan $c %c]
 
         while {$n > 0} {
             #We use 4 bits for each path, so we have 16 nodes.
             set p [expr {$n & 15}]
 
             #Append this to the path.
             lappend r $p
 
             #Shift 4 bits to the right now that we got 2^4 - 1 with our & earlier.
             set n [expr {$n >> 4}]
         }
     }
 
     return $r
 }
 
 proc trie-create {} {
     #We create 17 list items.  The 17th is for the value of the association.
 
     return [lrepeat 17 [list]]
 }
 
 proc trie-set {tvar key value} {
     upvar 1 $tvar t
 
     set pathlist [trie-path $key]
 
 
     #First build up a path of valid sublists.
     set path [list]
 
     foreach p $pathlist {
         lappend path $p
 
         set subt [lindex $t {*}$path]
         if {![llength $subt]} {
             lset t {*}$path [trie-create]
         }
     }
 
     lset t {*}$pathlist 16 $value
 
     #Enable this to see the actual serialized structure of the trie.
     #puts $t
 }
 
 proc trie-get {t key valuevar} {
     upvar 1 $valuevar value
 
     set pathlist [trie-path $key]
 
     set path [list]
     foreach p $pathlist {
         lappend path $p
 
         if {![llength [lindex $t {*}$path]]} {
             #This key's path is not in the trie.
             return 0
         }
     }
 
     set value [lindex $t {*}$pathlist 16]
     #We found the value associated with the key.
     return 1
 }
 
 proc main {} {
     set t [trie-create]
     
     set key1 "Peachy!"
     set key2 "Harder!  Better!  Faster!  Stronger!"
 
     trie-set t $key1 123 
 
     if {[trie-get $t $key1 value]} {
         puts "$value is associated with $key1"
     }
 
     if {[trie-get $t "invalid" value]} {
         puts "err invalid key generated: $value"
     }
 
     trie-set t $key2 456
 
     if {[trie-get $t $key2 value]} {
         puts "$value is associated with $key2"
     }
     
 }
 main

kruzalex With changes allowing run it under Tcl 8.4 and without forcing 17th as the value of the association see the result after running the code below:

  proc lrepeat {value number} {
        set res [list]
    for {set i 0} {$i<$number} {incr i} {
        lappend res $value
    }
    set res
 } ;# RS

 proc trie-create {} {
    return [lrepeat [list] 17]
 }

 proc trie-set {tvar key value} {
     upvar 1 $tvar t

     set pathlist [trie-path $key]

     foreach p $pathlist {
         lappend path $p
         set subt [lindex $t $path]
         if {![llength $subt]} {
             lset t $path [trie-create]
             
         }
     }
     lset t $pathlist $value
 }
 
 proc trie-get {t key} {
     #upvar 1 $valuevar value

     set pathlist [trie-path $key]

     set path [list]
     foreach p $pathlist {
         lappend path $p
         if {![llength [lindex $t $path]]} {
             return 0
         }

     }
     set value [lindex $t $pathlist]
     return $value
 }
 
 proc main {} {
     set t [trie-create]

     set key "A"
     trie-set t $key result

     #if {[trie-get $t $key value]} {
     #   puts "$value is associated with $key"
     #}
     puts [trie-get $t $key]
 }
 main

28/01/10 CMcC has written another trie implementation ... no reflection on the foregoing. This one might be a Patricia Trie.

  # Trie data structure
  
  package provide Trie 1.0
  if {[catch {package require Debug}]} {
      proc Debug.trie {args} {
          #puts stderr [uplevel subst $args]
      }
  }
  
  oo::class create ::Trie {
      variable trie id
  
      # search for longest prefix, return matching prefix, element and suffix
      method matches {t what} {
          set matches {}
          set wlen [string length $what]
          foreach k [lsort -decreasing -dictionary [dict keys $t]] {
              set klen [string length $k]
              set match ""
              for {set i 0} {$i < $klen
                             && $i < $wlen
                             && [string index $k $i] eq [string index $what $i]
                         } {incr i} {
                  append match [string index $k $i]
              }
              if {$match ne ""} {
                  lappend matches $match $k
              }
          }
          Debug.trie {matches: $what -> $matches}
  
          if {[dict size $matches]} {
              # find the longest matching prefix
              set match [lindex [lsort -dictionary [dict keys $matches]] end]
              set mel [dict get $matches $match]
              set suffix [string range $what [string length $match] end]
              
              return [list $match $mel $suffix]
          } else {
              return {}        ;# no matches
          }
      }
  
      # return next unique id if there's no proffered value
      method id {value} {
          if {$value} {
              return $value
          } else {
              return [incr id]
          }
      }
  
      # insert an element with a given optional value into trie
      # along path given by $args (no need to specify)
      method insert {what {value 0} args} {
          if {[llength $args]} {
              set t [dict get $trie {*}$args]
          } else {
              set t $trie
          }
  
          if {[dict exists $t $what]} {
              Debug.trie {$what is an exact match on path ($args $what)}
              if {[catch {dict size [dict get $trie {*}$args $what]} size]} {
                  # the match is a leaf - we're done
              } else {
                  # the match is a dict - we have to add a null
                  dict set trie {*}$args $what "" [my id $value]
              }
  
              return        ;# exact match - no change
          }
  
          # search for longest prefix
          set match [my matches $t $what]
  
          if {![llength $match]} {
              ;# no matching prefix - new element
              Debug.trie {no matching prefix of '$what' in $t - add it on path ($args $what)}
              dict set trie {*}$args $what [my id $value]
              return
          }
  
          lassign $match match mel suffix        ;# prefix, element of match, suffix
  
          if {$match ne $mel} {
              # the matching element shares a prefix, but has a variant suffix
              # it must be split
              Debug.trie {splitting '$mel' along '$match'}
              set melC [dict get $t $mel]
              dict unset trie {*}$args $mel
              dict set trie {*}$args $match [string range $mel [string length $match] end] $melC
          }
  
          if {[catch {dict size [dict get $trie {*}$args $match]} size]} {
              # the match is a leaf - must be split
              if {$match eq $mel} {
                  # the matching element shares a prefix, but has a variant suffix
                  # it must be split
                  Debug.trie {splitting '$mel' along '$match'}
                  set melC [dict get $t $mel]
                  dict unset trie {*}$args $mel
                  dict set trie {*}$args $match "" $melC
              }
              Debug.trie {'$mel' is the longest prefix '$match' but was a leaf - insert '$suffix'}
              set melid [dict get $t $mel]
              dict set trie {*}$args $match $suffix [my id $value]
          } else {
              # it's a dict - keep searching
              Debug.trie {'$mel' is the longest prefix '$match' and is a dict - search for '$suffix' on path ($args $match)}
              my insert $suffix $value {*}$args $match
          }
      }
  
      # find a path matching an element $what
      # if the element's not found, return the nearest path
      method find_path {what args} {
          if {[llength $args]} {
              set t [dict get $trie {*}$args]
          } else {
              set t $trie
          }
          
          if {[dict exists $t $what]} {
              Debug.trie {$what is an exact match on path ($args $what)}
              return [list {*}$args $what]        ;# exact match - no change
          }
  
          # search for longest prefix
          set match [my matches $t $what]
  
          if {![llength $match]} {
              return $args
          }
  
          lassign $match match mel suffix        ;# prefix, element of match, suffix
  
          if {$match ne $mel} {
              # the matching element shares a prefix, but has a variant suffix
              # no match
              return $args
          }
  
          if {[catch {dict size [dict get $trie {*}$args $match]} size] || $size == 0} {
              # got to a non-matching leaf - no match
              return $args
          } else {
              # it's a dict - keep searching
              Debug.trie {'$mel' is the longest prefix '$match' and is a dict - search for '$suffix' on path ($args $match)}
              return [my find_path $suffix {*}$args $match]
          }
      }
  
      # given a trie, which may have been modified by deletion,
      # optimize it by removing empty nodes and coalescing singleton nodes
      method optimize {args} {
          if {[llength $args]} {
              set t [dict get $trie {*}$args]
          } else {
              set t $trie
          }
  
          if {[catch {dict size $t} size]} {
              Debug.trie {optimize leaf '$t' along '$args'}
              # leaf - leave it
          } else {
              switch -- $size {
                  0 {
                      Debug.trie {optimize empty dict ($t) along '$args'}
                      if {[llength $args]} {
                          dict unset trie {*}$args
                      }
                  }
                  1 {
                      Debug.trie {optimize singleton dict ($t) along '$args'}
                      lassign $t k v
                      if {[llength $args]} {
                          dict unset trie {*}$args
                      }
                      append args $k
                      if {[llength $v]} {
                          dict set trie {*}$args $v
                      }
                      my optimize {*}$args
                  }
                  default {
                      Debug.trie {optimize dict ($t) along '$args'}
                      dict for {k v} $t {
                          my optimize {*}$args $k
                      }
                  }
              }
          }
      }
  
      # delete element $what from trie
      method delete {what} {
          set path [my find_path $what]
          if {[join $path ""] eq $what} {
              Debug.trie {del '$what' along ($path) was [dict get $trie {*}$path]}
              if {[catch {dict size [dict get $trie {*}$path]} size]} {
                  # got to a matching leaf - delete it
                  dict unset trie {*}$path
                  set path [lrange $path 0 end-1]
              } else {
                  dict unset trie {*}$path ""
              }
  
              my optimize        ;# remove empty and singleton elements
          } else {
              # nothing to delete, guess we're done
          }
      }
  
      # find the value of element $what in trie,
      # error if not found
      method find {what} {
          set path [my find_path $what]
          if {[join $path ""] eq $what} {
              if {[catch {dict size [dict get $trie {*}$path]} size]} {
                  # got to a matching leaf - done
                  return [dict get $trie {*}$path]
              } else {
                  return [dict get $trie {*}$path ""]
              }
          } else {
              error "'$what' not found"
          }
      }
  
      # dump the trie as a string
      method dump {} {
          return $trie
      }
  
      # return a string rep of the trie sorted in dict order
      method order {{t {}}} {
          if {![llength $t]} {
              set t $trie
          } elseif {[llength $t] == 1} {
              return $t
          }
          set acc {}
  
          foreach key [lsort -dictionary [dict keys $t]] {
              lappend acc $key [my order [dict get $t $key]]
          }
          return $acc
      }
  
      # return the trie as a dict of names with values
      method flatten {{t {}} {prefix ""}} {
          if {![llength $t]} {
              set t $trie
          } elseif {[llength $t] == 1} {
              return [list $prefix $t]
          }
  
          set acc {}
  
          foreach key [dict keys $t] {
              lappend acc {*}[my flatten [dict get $t $key] $prefix$key]
          }
          return $acc
      }
  
      # overwrite the trie
      method set {t} {
          set trie $t
      }
  
      constructor {args} {
          set trie {}
          set id 0
          foreach a $args {
              my insert $a
          }
      }
  }
  
  if {[info script] eq $argv0} {
      set data {
          rubber        romane        eunt        domus        romanus        romulus
          rubens        ruber        rube        rubicon        rubicundus        roman
          an        antidote        anecdotal        ant        all
          alloy        allotrope        allot        aloe        are        ate
          be cataract catatonic catenary
      }
      ::Trie create example {*}$data
      puts "TRIE: [example dump]"
      puts "OTRIE: [example order]"
      example set [example order]
      puts "FLAT: [example flatten]"
      foreach d $data {
          puts "$d -> '[example find_path $d]' -> [example find $d]"
      }
  
      foreach d $data {
          example delete $d
          puts "DEL '$d': [example dump]"
      }
  
      puts "TRIE: [example dump]"
  }