Version 6 of Longest common subsequence: folklore algorithm

Updated 2022-01-17 09:35:39 by pooryorick

if 0 { KBK, PYK: The longest common subsequence problem is:

Given two lists L1 and L2, compute the largest set of ordered pairs

    {x1,y1), (x2,y2), ..., (xn, yn)

such that

    x1 < x2 < ... < xn
    y1 < y2 < ... < yn

and

    L1[x1] = L2[y1] ; L1[x2] = L2[y2] ; ... L1[xn] = L2[yn]

In contrast with longest common substring, the resulting ordered pairs are not necessarily contiguous. Solving this problem is critical to the implementation of diff in Tcl. Here is one Tcl implementation of the "folklore algorithm."

Hunt and McIlroy have published a much better algorithm (used in the Unix 'diff' command) that is implemented at diff in Tcl. This page is here to hold some of the historical discussion.


}

namespace eval list {}
 
namespace eval list::longestCommonSubsequence {
    namespace export compare
}

# Internal procedure that indexes into the 2-dimensional array t,
# which corresponds to the sequence y, looking for the (i,j)th element.

proc list::longestCommonSubsequence::Index { t y i j } {
    set indx [expr { ([llength $y] + 1) * ($i + 1) + ($j + 1) }]
    return [lindex $t $indx]
}
    
# Internal procedure that implements Levenshtein to derive the longest
# common subsequence of two lists x and y.

proc list::longestCommonSubsequence::ComputeLCS { x y } {
    set t [list]
    for { set i -1 } { $i < [llength $y] } { incr i } {
        lappend t 0
    }
    for { set i 0 } { $i < [llength $x] } { incr i } {
        lappend t 0
        for { set j 0 } { $j < [llength $y] } { incr j } {
            if { [lindex $x $i] eq [lindex $y $j] } {
                set lastT [Index $t $y [expr { $i - 1 }] [expr {$j - 1}]]
                set nextT [expr {$lastT + 1}]
            } else {
                set lastT1 [Index $t $y $i [expr { $j - 1 }]]
                set lastT2 [Index $t $y [expr { $i - 1 }] $j]
                if { $lastT1 > $lastT2 } {
                    set nextT $lastT1
                } else {
                    set nextT $lastT2
                }
            }
            lappend t $nextT
        }
    }
    return $t
}

# Internal procedure that traces through the array built by ComputeLCS
# and finds a longest common subsequence -- specifically, the one that
# is lexicographically first.

proc list::longestCommonSubsequence::TraceLCS { t x y } {
    set trace {}
    set i [expr { [llength $x] - 1 }]
    set j [expr { [llength $y] - 1 }]
    while { $i >= 0 && $j >= 0 } {
        set im1 [expr { $i - 1 }]
        set jm1 [expr { $j - 1 }]
        if { [Index $t $y $i $j] == [Index $t $y $im1 $jm1] + 1
             && [lindex $x $i] eq [lindex $y $j] } {
            lappend trace xy [list $i $j]
            set i $im1
            set j $jm1
        } elseif { [Index $t $y $im1 $j] > [Index $t $y $i $jm1] } {
            lappend trace x $i
            set i $im1
        } else {
            lappend trace y $j
            set j $jm1
        }
    }
    while { $i >= 0 } {
        lappend trace x $i
        incr i -1
    }
    while { $j >= 0 } {
        lappend trace y $j
        incr j -1
    }
    return $trace
}

# list::longestCommonSubsequence::compare --
#
#       Compare two lists for the longest common subsequence
#
# Arguments:
#       x, y - Two lists of strings to compare
#       matched - Callback to execute on matched elements, see below
#       unmatchedX - Callback to execute on unmatched elements from the
#                    first list, see below.
#       unmatchedY - Callback to execute on unmatched elements from the
#                    second list, see below.
#
# Results:
#       None.
#
# Side effects:
#       Whatever the callbacks do.
#
# The 'compare' procedure compares the two lists of strings, x and y.
# It finds a longest common subsequence between the two.  It then walks
# the lists in order and makes the following callbacks:
#
# For an element that is common to both lists, it appends the index in
# the first list, the index in the second list, and the string value of
# the element as three parameters to the 'matched' callback, and executes
# the result.
#
# For an element that is in the first list but not the second, it appends
# the index in the first list and the string value of the element as two
# parameters to the 'unmatchedX' callback and executes the result.
#
# For an element that is in the second list but not the first, it appends
# the index in the second list and the string value of the element as two
# parameters to the 'unmatchedY' callback and executes the result.

proc list::longestCommonSubsequence::compare { x y
                                               matched
                                               unmatchedX unmatchedY } {
    set t [ComputeLCS $x $y]
    set trace [TraceLCS $t $x $y]
    set i [llength $trace]
    while { $i > 0 } {
        set indices [lindex $trace [incr i -1]]
        set type [lindex $trace [incr i -1]]
        switch -exact -- $type {
            xy {
                set c $matched
                eval lappend c $indices
                lappend c [lindex $x [lindex $indices 0]]
                uplevel 1 $c
            }
            x {
                set c $unmatchedX
                lappend c $indices
                lappend c [lindex $x $indices]
                uplevel 1 $c
            }
            y {
                set c $unmatchedY
                lappend c $indices
                lappend c [lindex $y $indices]
                uplevel 1 $c
            }
        }
    }
    return
}

# With this code in hand, we can now write the external parts of a diff command. The various options of diff alter how it displays the comparison, but not its fundamental operation. Here's an external wrapper that gives very simple-minded output.

namespace import list::longestCommonSubsequence::compare

proc umx { index value } {
    variable lastx
    variable xlines
    append xlines "< " $value \n
    set lastx $index
}

proc umy { index value } {
    variable lasty
    variable ylines
    append ylines "> " $value \n
    set lasty $index
}

proc matched { index1 index2 value } {
    variable lastx
    variable lasty
    variable xlines
    variable ylines
    if { [info exists lastx] && [info exists lasty] } {
        puts "[expr { $lastx + 1 }],${index1}c[expr {$lasty + 1 }],${index2}"
        puts -nonewline $xlines
        puts "----"
        puts -nonewline $ylines
    } elseif { [info exists lastx] } {
        puts "[expr { $lastx + 1 }],${index1}d${index2}"
        puts -nonewline $xlines
    } elseif { [info exists lasty] } {
        puts  "${index1}a[expr {$lasty + 1 }],${index2}"
        puts -nonewline $ylines
    }
    catch { unset lastx }
    catch { unset xlines }
    catch { unset lasty }
    catch { unset ylines }
}
        
# Really, we should read the first file in like this:
#    set f0 [open [lindex $argv 0] r]
#    set x [split [read $f0] \n]
#    close $f0
# But I'll just provide some sample lines:

set x {}
for { set i 0 } { $i < 10 } { incr i } {
    lappend x a r a d e d a b r a x
}

# The second file, too, should be read in like this:
#    set f1 [open [lindex $argv 1] r]
#    set y [split [read $f1] \n]
#    close $f1
# Once again, I'll just do some sample lines.

set y {}
for { set i 0 } { $i < 10 } { incr i } {
    lappend y a b r a c a d a b r a
}

compare $x $y matched umx umy
matched [llength $x] [llength $y] {}

if 0 {


}