Some ways to do set comparision

In Tcl Gems Michael Schlenker saw a proc by Ed Suominen to do list comparision (set comparision would be more precise). This is related to Manipulating sets in tcl.

As setok said it seemed suboptimal, so i did some tests with alternatives:

So here are the rivals:

Ed's original (it does return incorrect results...):

 proc listcomp1 { list1 list2 out1Name out2Name } {

   ### Define empty lists in case one has no unique elements
   set out1 {}; set out2 {}

   ### Test each element of each list against all elements of other list

   foreach {i} $list1 {j} $list2 {

     # First, test for unique element in list1
     if { [ lsearch  -exact $list2 $i ] < 0 } { lappend out1 $i }
     # Then test for unique element in list2
     if { [ lsearch  -exact $list1 $j ] < 0 } { lappend out2 $j }
   }

   ### Put results in specified lists
   upvar $out1Name x
   set x $out1
   upvar $out2Name x
   set x $out2

   ### END LISTCOMP
   return
 }

My first try with presorted lists (this returns junk too, like eds):

 proc listcomp2 { list1 list2 out1Name out2Name } {

   set out1 {}; set out2 {}

   set list1 [lsort -increasing [K $list1 [set list1 ""]]]
   set list2 [lsort -increasing [K $list2 [set list2 ""]]]
   foreach {i} $list1 {j} $list2 {

     if { [ lsearch  -sorted -exact $list2 $i ] < 0 } { lappend out1 $i }
     if { [ lsearch  -sorted -exact $list1 $j ] < 0 } { lappend out2 $j }
   }

   upvar #0 $out1Name x
   set x $out1
   upvar #0 $out2Name x
   set x $out2

   return
 }

My second try with an array (this works correct):

 proc listcomp3 { list1 list2 out1Name out2Name } {
      
   set x [list]
   set y [list]
   
   # cache boolean representation in 0val and 1val
   set 0val [expr {0!=0}]
   set 1val [expr {1==1}]
   
   foreach item $list1 {
        set A($item) $0val
   }

   foreach item $list2 {
        if {[info exists A($item)]} {
            unset A($item)
        } else {
            set A($item) $1val
        }
   }
   foreach key [array names A] {
        if {$A($key)} {
            lappend x $key
        } else {
            lappend y $key
        }
   }

   upvar $out1Name B
   set B $x
   upvar $out2Name C
   set C $y
   
   return
 }

My third try with a while loop (this works correct too):

 proc listcomp4 {list1 list2 outVar1 outVar2} { 
    
    set A [list]
    set B [list]        

    set i 0
    set j 0
    
    set list1 [lsort $list1]
    set list2 [lsort $list2]
    set l1 [llength $list1]
    set l2 [llength $list2]
        
    while {($i < $l1) && ($j < $l2)} {
        if {[set w [string compare [lindex $list1 $i] [lindex $list2 $j]]] == 0} {
            # equal
            incr i
            incr j
        } else {
            if {$w < 0} {
                # list1 < list2
                lappend A [lindex $list1 $i]
                incr i
            } else {
                lappend B [lindex $list2 $j]
                incr j
            }
        }
    }
    
    if {$i < $l1} {
        #not finished with list1 yet
        #add the remaining parts to A as unique
        set A [concat $A [lrange $list1 $i end]]
        # B is complete
    } else {
        if {$j < $l2} {
          #not finished list2 yet
          set B [concat $B [lrange $list2 $j end]]        
        }
    }   
    
    upvar $outVar1 x
    set x $A
    upvar $outVar2 y
    set y $B
    return
 }

Now the simple benchmark suite:

 proc K {x y} {set x}

 proc buildlist {n m} {
    set result [list]
    for {set i 0} {$i < $n} {incr i} {
        if {$i % $m} {
            lappend result $i
        }
    }
    return $result
 }

 proc buildlistR {n m} {
    set result [list]
    incr n -1
    for {set i $n} {$i > 0} {incr i -1} {
        if {$i % $m} {
            lappend result $i
        }
    }
    return $result
 }

 proc shuffle6 { list } {
     set n [llength $list]
     for { set i 1 } { $i < $n } { incr i } {
         set j [expr { int( rand() * $n ) }]
         set temp [lindex $list $i]
         lset list $i [lindex $list $j]
         lset list $j $temp
     }
     return $list
 }

 proc runTest {proc l1 l2 iter} {
    set A [list]
    set B [list]
    set t [time {$proc $l1 $l2 A B} $iter]
    return $t
 } 
 
 proc timetest {size modulo1 modulo2 iter} {
    puts "Modulus set to $modulo1 / $modulo2, size set to $size, $iter iterations"

    set l1 [buildlist $size $modulo1]
    set l2 [buildlist $size $modulo2]

    puts "Running with increasing sorted lists [llength $l1], [llength $l2] items"
    
    puts "----------------------------------------------------"
    foreach p [list listcomp1 listcomp2 listcomp3 listcomp4] {
         puts "$p\t: [runTest $p $l1 $l2 $iter]"
    }
    puts "----------------------------------------------------\n"

    set l1 [buildlistR $size $modulo1]
    set l2 [buildlistR $size $modulo2]

    puts "Running with decreasing sorted lists [llength $l1], [llength $l2] items"
    
    puts "----------------------------------------------------"
    foreach p [list listcomp1 listcomp2 listcomp3 listcomp4] {
         puts "$p\t: [runTest $p $l1 $l2 $iter]"
    }
    puts "----------------------------------------------------\n"

    set l1 [shuffle6 $l1]
    set l2 [shuffle6 $l2]

    puts "Running with shuffled lists [llength $l1], [llength $l2] items"
    
    puts "----------------------------------------------------"
    foreach p [list listcomp1 listcomp2 listcomp3 listcomp4] {
         puts "$p\t: [runTest $p $l1 $l2 $iter]"
    }
    puts "----------------------------------------------------\n"
    
 }

Timings done on my PII 350, W2k, ActiveTcl 8.4.2.0.

 % timetest 1000 2 3 100

 Modulus set to 2 / 3, size set to 1000, 100 iterations
 Running with increasing sorted lists 500, 666 items
 ----------------------------------------------------
 listcomp1        : 179145 microseconds per iteration
 listcomp2        : 186705 microseconds per iteration
 listcomp3        : 23429 microseconds per iteration
 listcomp4        : 12538 microseconds per iteration
 ----------------------------------------------------

 Running with decreasing sorted lists 500, 666 items
 ----------------------------------------------------
 listcomp1        : 178605 microseconds per iteration
 listcomp2        : 185959 microseconds per iteration
 listcomp3        : 23441 microseconds per iteration
 listcomp4        : 12320 microseconds per iteration
 ----------------------------------------------------

 Running with shuffled lists 500, 666 items
 ----------------------------------------------------
 listcomp1        : 184550 microseconds per iteration
 listcomp2        : 187540 microseconds per iteration
 listcomp3        : 23535 microseconds per iteration
 listcomp4        : 13679 microseconds per iteration
 ----------------------------------------------------