string mapx

Keith Vetter 2017-08-10 : On the string map page, there was a request for a version of string map that takes regular expressions instead of fixed strings. There's a quick and dirty solution on that page but it doesn't handle the tricky case of double substitutions, e.g. string map {a b b c} "a" should return just "b".

Since the request was from over 11 years ago, it's obviously not a high demand item. But I thought it would be a fun intellectual challenge. This isn't the most efficient method--it does too many regex searches and replacements, but the more efficient solution gets too complicated when you start allowing references.

You can even add it into the string ensemble via

namespace ensemble configure string -map [dict merge [namespace ensemble configure string -map] {mapx string_mapx}]

##+##########################################################################
#
# string mapx -- like string map but with regular expressions
#  see also https://wiki.tcl-lang.org/2819
# by Keith Vetter 2017-08-07
#

proc string_mapx {mapping str} {
    if {[llength $mapping] % 2 != 0} {
        return -code error "regex map list unbalanced"
    }
    # Difficulty is we must avoid double substition, meaning we cannot
    # keep matching in replaced text.  e.g. string mapx {A. BB B. CC} AA
    # should return "BB" not "CC". To do so, we maintain a list of
    # <unmatched text>, <matched text> pairs and only do matching on the
    # first part. When done we join all the pieces.

    set pieces [list $str ""]
    foreach {regex replace_with} $mapping {
        set new_pieces {}
        foreach {unmatched already_matched} $pieces {
            set mapping [_string_mapx_one $unmatched $regex $replace_with]
            lset mapping end $already_matched
            lappend new_pieces {*}$mapping
        }
        set pieces $new_pieces
    }

    set result [join $pieces ""]
    return $result
}

proc _string_mapx_one {str regex replace_with} {
    # Replaces all occurrences of regex in str with replace_with
    # The result is a list of prefix and replace_with pairs
    # e.g. _string_mapx_one "xAByACz" {A.} "M" => {x M y M z ""}
    set result {}
    set start 0
    while {1} {
        set n [regexp -start $start -indices $regex $str index]
        if {! $n} break
        lassign $index lo hi
        set prefix [string range $str $start $lo-1]
        set match [string range $str $lo $hi]

        lappend result $prefix [regsub $regex $match $replace_with]
        set start [expr {$hi + 1}]
    }
    lappend result [string range $str $start end] ""
    return $result
}
################################################################
#
# testing code
#
proc test {expected mapping str} {
    set got [string_mapx $mapping $str]
    puts "string_mapx $mapping '$str'"
    if {$got ne $expected} {
        puts "BAD: expected: '$expected'   got: '$got'"
    }
}

test xxMyyMzz {A. M} xxAByyACzz
test xxyyzz {A. ""} xxAByyABzz
test xNxMyNyMzz {A. M CC N} xCCxAByCCyABzz
test bca {a b b c c a} abc
test "NOW IS THE TIME" {{n[^ ]*} NOW {now is} BAD {i.*the} "IS THE" {t.*time} BAD time TIME} "now is the time"

test "dbca dbca" {{(a)(bc)(d)} {\3\2\1}} {abcd abcd}
test "dbcadbca"  {{(.)(..)(.)} {\3\2\1}} {abcdabcd}
test "dbca dbca" {{(a)(..)(.)} {\3\2\1}} {abcd abcd}
return