Longest common word prefix

dbohdan 2018-03-26: The following module lets you find the longest common word prefix of two strings, which is to say, the first N words the strings have in common. For our purposes words are defined as string fragments separated by separators. A separator is a string that matches a given regular expression understood by regexp.

See also

Code

Download with wiki-reaper: wiki-reaper -x 55230 0 | tee lcwp.tcl

#! /usr/bin/env tclsh
# Copyright (c) 2018, 2021 D. Bohdan
# License: MIT
package require Tcl 8.5-10

namespace eval ::lcwp {
    variable version 0.3.0
    interp alias {} lcwp {} longest-common-word-prefix
}

proc ::lcwp::longest-common-word-prefix {
    s1 s2 {sep {\s+}} {includeTailSep 0}
} {
    if {[string length $s2] > [string length $s1]} {
        set t $s2
        set s2 $s1
        set s1 $t
        unset t
    }

    set offset 0
    set tailSepLength 0
    while 1 {
        lassign [read-word $s1 $offset $sep] label1 \
                                             matchedFramement1 \
                                             matchedSep1
        lassign [read-word $s2 $offset $sep] label2 \
                                             matchedFramement2 \
                                             matchedSep2

        # Handle fragments.
        if {$matchedFramement1 ne $matchedFramement2} {
            break
        }
        set fragmentLength [string length $matchedFramement1]
        incr offset $fragmentLength
        if {$fragmentLength > 0} {
            set tailSepLength 0
        }

        # Handle separators.
        if {$matchedSep1 ne $matchedSep2} {
            break
        }
        incr tailSepLength [string length $matchedSep1]
        incr offset [string length $matchedSep1]

        # Handle string end.
        if {$label1 eq {END} || $label2 eq {END}} {
            break
        }
    }
    if {!$includeTailSep} {
        incr offset -$tailSepLength
    }
    return [string range $s1 0 $offset-1]
}

proc ::lcwp::read-word {s offset sep} {
    if {[regexp -indices -start $offset -- $sep $s match]} {
        lassign $match start end
        set matchedFramement [string range $s $offset $start-1]
        set matchedSep [string range $s $start $end]
        set label MORE
    } else {
        set matchedFramement [string range $s $offset end]
        set matchedSep {}
        set label END
    }
    return [list $label $matchedFramement $matchedSep]
}

proc ::lcwp::replace-prefix {prefix s {replacement { }}} {
    set prefixLen [string length $prefix]
    set replacementLen [string length $replacement]
    set repeats [expr {
        $replacementLen > 0 ?
        $prefixLen / $replacementLen + 1 :
        0
    }]
    set newPrefix [string range [string repeat $replacement $repeats] \
                                0 \
                                $prefixLen-1]
    return $newPrefix[string range $s $prefixLen end]
}

# If this is the main script, run the tests.
if {[info exists argv0] && ([file tail [info script]] eq [file tail $argv0])} {
    package require tcltest
    namespace path ::lcwp

    tcltest::test common-word-prefix-1.1 {simple case} -body {
        lcwp {hello world 1} {hello world 2}
    } -result {hello world}

    tcltest::test common-word-prefix-1.2 {all the same words} -body {
        list [lcwp foo foo] \
             [lcwp {hello world} {hello world}]
    } -result {foo {hello world}}

    tcltest::test common-word-prefix-1.3 {all different words} -body {
        list [lcwp foo bar] \
             [lcwp {foo bar} {baz qux}]
    } -result {{} {}}

    tcltest::test common-word-prefix-1.4 {words sharing a prefix} -body {
        list [lcwp foo food] \
             [lcwp fool food] \
             [lcwp {hello world alpha} {hello world aleph}]
    } -result {{} {} {hello world}}

    tcltest::test common-word-prefix-1.5 {different length} -body {
        list [lcwp {foo bar baz} foo] \
             [lcwp {foo bar baz} {foo bar}] \
             [lcwp {foo bar baz} {foo bar }] \
             [lcwp {foo bar } {foo bar baz}] \
             [lcwp {foo bar} {foo bar baz}] \
             [lcwp foo {foo bar baz}]
    } -result {foo {foo bar} {foo bar} {foo bar} {foo bar} foo}

    tcltest::test common-word-prefix-1.6 includeTailSep -body {
        list [lcwp {hello world 1} {hello world 2} { } 0] \
             [lcwp {hello world 1} {hello world 2} { } 1] \
             [lcwp hello-world-1 hello-world-2 - 0] \
             [lcwp hello-world-1 hello-world-2 - 1]
    } -result {{hello world} {hello world } hello-world hello-world-}

    tcltest::test common-word-prefix-1.7 whitespace-1 -body {
        list [lcwp {foo  bar   1} {foo  bar } { } 0] \
             [lcwp {foo  bar   1} {foo  bar } { } 1] \
             [lcwp {  foo  bar   1} {  foo  bar  } { } 0] \
             [lcwp {  foo  bar   1} {  foo  bar  } { } 1]
    } -result {{foo  bar} {foo  bar } {  foo  bar} {  foo  bar  }}

    tcltest::test replace-prefix-1.1 {default use} -body {
        list [replace-prefix {} {}] \
             [replace-prefix foo foo] \
             [replace-prefix {foo bar} {foo bar baz}]
    } -result {{} {   } {        baz}}

    tcltest::test replace-prefix-1.2 pattern -body {
        list [replace-prefix {} {} 12345] \
             [replace-prefix foo foo 12345] \
             [replace-prefix {foo bar } {foo bar baz} 12345]
    } -result {{} 123 12345123baz}

    # Exit with a nonzero status if there are failed tests.
    set failed [expr {$tcltest::numTests(Failed) > 0}]

    tcltest::cleanupTests
    if {$failed} {
        exit 1
    }
}

Use example

Code

source lcwp.tcl

proc log text {
    set message "[clock format [clock seconds] -gmt 1] $text"
    set prefix [::lcwp::longest-common-word-prefix $::prevLogMessage \
                                                   $message \
                                                   {\s} \
                                                   1]
    puts stderr [::lcwp::replace-prefix $prefix $message $::logPlaceholder]
    set ::prevLogMessage $message
}

foreach logPlaceholder {{ } -=} {
    set prevLogMessage {}
    log "frobnicating file /foo/bar"
    log "frobnicating file /foo/baz"
    log "frobnicating file /foo/qux"
}

Output

Mon Mar 26 16:37:34 GMT 2018 frobnicating file /foo/bar
                                               /foo/baz
                                               /foo/qux
Mon Mar 26 16:37:34 GMT 2018 frobnicating file /foo/bar
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/foo/baz
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/foo/qux