Wasserstein distance

Arjen Markus (24 july 2018) Harm Olthof asked about a Tcl implementation of the so-called Wasserstein distance or "Earth Mover's Distance" as a measure for the discrepancy between two probability distributions. While the full definition is rather mathematical and technical, if you limit yourself to one-dimensional probability distributions in the form of histograms based on uniformly sized bins, the algorithm is almost trivial (see for instance: https://math.stackexchange.com/questions/714476/how-do-you-compute-numerically-the-earth-movers-distance-emd )

Here is an implementation - note that in two or more dimensions a somewhat similar approach is possible:

# stat-wasserstein.tcl --
#     Determine the Wasserstein distance between two probability distributions
#
#     Note:
#     This is an implementation for one-dimensional distributions (or better:
#     non-negative patterns)
#

# LastNonZero --
#     Auxiliary procedure to find the last non-zero entry
#
# Arguments:
#     prob           Probability distribution
#
# Result:
#     Index in the list of the last non-zero entry
#
# Note:
#     To avoid numerical problems any value smaller than 1.0e-10 is considered to
#     be zero
#
proc LastNonZero {prob} {
    set maxidx [expr {[llength $prob] - 1}]

    for {set idx $maxidx} {$idx >= 0} {incr idx -1} {
        if { [lindex $prob $idx] > 1.0e-10 } {
            return $idx
        }
    }

    return -1 ;# No non-zero entry
}

# Normalise --
#     Auxiliary procedure to normalise the probability distribution
#
# Arguments:
#     prob           Probability distribution
#
# Result:
#     Normalised distribution (i.e. the entries sum to 1)
#
# Note:
#     To avoid numerical problems any value smaller than 1.0e-10 is set to zero
#
proc Normalise {prob} {

    set newprob {}
    set sum     0.0

    foreach p $prob {
        set sum [expr {$sum + $p}]
    }

    if { $sum == 0.0 } {
        return -code error "Probability distribution should not consist of only zeroes"
    }

    foreach p $prob {
        lappend newprob [expr {$p > 1.0e-10? ($p/$sum) : 0.0}]
    }

    return $newprob
}

# wasserstein-distance --
#     Determine the Wasserstein distance using a "greedy" algorithm.
#
# Arguments:
#     prob1          First probability distribution, interpreted as a histogram
#                    with uniform bin width
#     prob2          Second probability distribution
#
# Result:
#     Distance between the two distributions
#
proc wasserstein-distance {prob1 prob2} {
    #
    # First step: make sure the histograms have the same length and the
    # same cumulative weight.
    #
    if { [llength $prob1] != [llength $prob2] } {
        return -code error "Lengths of the probability histograms must be the same"
    }

    set prob1 [Normalise $prob1]
    set prob2 [Normalise $prob2]

    set distance 0.0

    #
    # Determine the last non-zero bin - this bin will be shifted to the second
    # distribution
    #
    while {1} {
        set idx1 [LastNonZero $prob1]
        set idx2 [LastNonZero $prob2]

        if { $idx1 < 0 } {
            break ;# We are done
        }

        set bin1 [lindex $prob1 $idx1]
        set bin2 [lindex $prob2 $idx2]

        if { $bin1 <= $bin2 } {
            lset prob1 $idx1 0.0
            lset prob2 $idx2 [expr {$bin2 - $bin1}]
            set distance [expr {$distance + abs($idx2-$idx1) * $bin1}]
        } else {
            lset prob1 $idx1 [expr {$bin1 - $bin2}]
            lset prob2 $idx2 0.0
            set distance [expr {$distance + abs($idx2-$idx1) * $bin2}]
        }
    }

    return $distance
}

# tests --
#

# Almost trivial
set prob1 {0.0 0.0 0.0 1.0}
set prob2 {0.0 0.0 1.0 0.0}

puts "Expected distance: 1"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric:  [wasserstein-distance $prob2 $prob1]"

# Less trivial
set prob1 {0.0 0.75 0.25 0.0}
set prob2 {0.0 0.0  1.0  0.0}

puts "Expected distance: 0.75"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric:  [wasserstein-distance $prob2 $prob1]"

# Shift trivial
set prob1 {0.0 0.1 0.2 0.4 0.2 0.1 0.0 0.0}
set prob2 {0.0 0.0 0.0 0.1 0.2 0.4 0.2 0.1}

puts "Expected distance: 2"
puts "Calculated: [wasserstein-distance $prob1 $prob2]"
puts "Symmetric:  [wasserstein-distance $prob2 $prob1]"