Version 0 of Longest common subsequence: folklore algorithm

Updated 2003-03-14 22:45:14

if 0 { KBK: The longest common subsequence problem is:

Given two lists L1 and L2, compute the largest set of ordered pairs

    {x1,y1), (x2,y2), ..., (xn, yn)

such that

    x1 < x2 < ... < xn
    y1 < y2 < ... < yn

and

    L1[x1] = L2[y1] ; L1[x2] = L2[y2] ; ... L1[xn] = L2[yn]

Solving this problem is critical to the implementation of diff in Tcl.

Here is one Tcl implementation of the "folklore algorithm."

Note that Hunt and McIlroy have published a much better algorithm (used in the Unix 'diff' command) that is implemented at diff in Tcl. This page is here to hold some of the historical discussion.


}

 namespace eval list {
 }

 namespace eval list::longestCommonSubsequence {
     namespace export compare
 }

 # Internal procedure that indexes into the 2-dimensional array t,
 # which corresponds to the sequence y, looking for the (i,j)th element.

 proc list::longestCommonSubsequence::Index { t y i j } {
     set indx [expr { ([llength $y] + 1) * ($i + 1) + ($j + 1) }]
     return [lindex $t $indx]
 }

 # Internal procedure that implements Levenshtein to derive the longest
 # common subsequence of two lists x and y.

 proc list::longestCommonSubsequence::ComputeLCS { x y } {
     set t [list]
     for { set i -1 } { $i < [llength $y] } { incr i } {
         lappend t 0
     }
     for { set i 0 } { $i < [llength $x] } { incr i } {
         lappend t 0
         for { set j 0 } { $j < [llength $y] } { incr j } {
             if { [string equal [lindex $x $i] [lindex $y $j]] } {
                 set lastT [Index $t $y [expr { $i - 1 }] [expr {$j - 1}]]
                 set nextT [expr {$lastT + 1}]
             } else {
                 set lastT1 [Index $t $y $i [expr { $j - 1 }]]
                 set lastT2 [Index $t $y [expr { $i - 1 }] $j]
                 if { $lastT1 > $lastT2 } {
                     set nextT $lastT1
                 } else {
                     set nextT $lastT2
                 }
             }
             lappend t $nextT
         }
     }
     return $t
 }

 # Internal procedure that traces through the array built by ComputeLCS
 # and finds a longest common subsequence -- specifically, the one that
 # is lexicographically first.

 proc list::longestCommonSubsequence::TraceLCS { t x y } {
     set trace {}
     set i [expr { [llength $x] - 1 }]
     set j [expr { [llength $y] - 1 }]
     set k [expr { [Index $t $y $i $j] - 1 }]
     while { $i >= 0 && $j >= 0 } {
         set im1 [expr { $i - 1 }]
         set jm1 [expr { $j - 1 }]
         if { [Index $t $y $i $j] == [Index $t $y $im1 $jm1] + 1
              && [string equal [lindex $x $i] [lindex $y $j]] } {
             lappend trace xy [list $i $j]
             set i $im1
             set j $jm1
         } elseif { [Index $t $y $im1 $j] > [Index $t $y $i $jm1] } {
             lappend trace x $i
             set i $im1
         } else {
             lappend trace y $j
             set j $jm1
         }
     }
     while { $i >= 0 } {
         lappend trace x $i
         incr i -1
     }
     while { $j >= 0 } {
         lappend trace y $j
         incr j -1
     }
     return $trace
 }

 # list::longestCommonSubsequence::compare --
 #
 #       Compare two lists for the longest common subsequence
 #
 # Arguments:
 #       x, y - Two lists of strings to compare
 #       matched - Callback to execute on matched elements, see below
 #       unmatchedX - Callback to execute on unmatched elements from the
 #                    first list, see below.
 #       unmatchedY - Callback to execute on unmatched elements from the
 #                    second list, see below.
 #
 # Results:
 #       None.
 #
 # Side effects:
 #       Whatever the callbacks do.
 #
 # The 'compare' procedure compares the two lists of strings, x and y.
 # It finds a longest common subsequence between the two.  It then walks
 # the lists in order and makes the following callbacks:
 #
 # For an element that is common to both lists, it appends the index in
 # the first list, the index in the second list, and the string value of
 # the element as three parameters to the 'matched' callback, and executes
 # the result.
 #
 # For an element that is in the first list but not the second, it appends
 # the index in the first list and the string value of the element as two
 # parameters to the 'unmatchedX' callback and executes the result.
 #
 # For an element that is in the second list but not the first, it appends
 # the index in the second list and the string value of the element as two
 # parameters to the 'unmatchedY' callback and executes the result.

 proc list::longestCommonSubsequence::compare { x y
                                                matched
                                                unmatchedX unmatchedY } {
     set t [ComputeLCS $x $y]
     set trace [TraceLCS $t $x $y]
     set i [llength $trace]
     while { $i > 0 } {
         set indices [lindex $trace [incr i -1]]
         set type [lindex $trace [incr i -1]]
         switch -exact -- $type {
             xy {
                 set c $matched
                 eval lappend c $indices
                 lappend c [lindex $x [lindex $indices 0]]
                 uplevel 1 $c
             }
             x {
                 set c $unmatchedX
                 lappend c $indices
                 lappend c [lindex $x $indices]
                 uplevel 1 $c
             }
             y {
                 set c $unmatchedY
                 lappend c $indices
                 lappend c [lindex $y $indices]
                 uplevel 1 $c
             }
         }
     }
     return
 }

# With this code in hand, we can now write the external parts of a diff command. The various options of diff alter how it displays the comparison, but not its fundamental operation. Here's an external wrapper that gives very simple-minded output.

 namespace import list::longestCommonSubsequence::compare

 proc umx { index value } {
     variable lastx
     variable xlines
     append xlines "< " $value \n
     set lastx $index
 }

 proc umy { index value } {
     variable lasty
     variable ylines
     append ylines "> " $value \n
     set lasty $index
 }

 proc matched { index1 index2 value } {
     variable lastx
     variable lasty
     variable xlines
     variable ylines
     if { [info exists lastx] && [info exists lasty] } {
         puts "[expr { $lastx + 1 }],${index1}c[expr {$lasty + 1 }],${index2}"
         puts -nonewline $xlines
         puts "----"
         puts -nonewline $ylines
     } elseif { [info exists lastx] } {
         puts "[expr { $lastx + 1 }],${index1}d${index2}"
         puts -nonewline $xlines
     } elseif { [info exists lasty] } {
         puts  "${index1}a[expr {$lasty + 1 }],${index2}"
         puts -nonewline $ylines
     }
     catch { unset lastx }
     catch { unset xlines }
     catch { unset lasty }
     catch { unset ylines }
 }

 # Really, we should read the first file in like this:
 #    set f0 [open [lindex $argv 0] r]
 #    set x [split [read $f0] \n]
 #    close $f0
 # But I'll just provide some sample lines:

 set x {}
 for { set i 0 } { $i < 10 } { incr i } {
     lappend x a r a d e d a b r a x
 }

 # The second file, too, should be read in like this:
 #    set f1 [open [lindex $argv 1] r]
 #    set y [split [read $f1] \n]
 #    close $f1
 # Once again, I'll just do some sample lines.

 set y {}
 for { set i 0 } { $i < 10 } { incr i } {
     lappend y a b r a c a d a b r a
 }

 compare $x $y matched umx umy
 matched [llength $x] [llength $y] {}

if 0 { }